public ENS scRNAseq data analysis

this Rmarkdown is the final version for RAISIN Smart-Seq2 data analysis

first load all needed packages/functions which are integrated in script 'analysis.r'
most of the script are from example code https://github.com/cssmillie/ulcerative_colitis, and modified to fit local environment and current datasets

source("analysis.r")

metadata processing

allmeta <- read.table(paste0(raw_dir,"all.meta.txt"), header = T, sep = "\t")
allmeta <- allmeta[2:nrow(allmeta),]
dim(allmeta)
## [1] 586460     16
cat(paste0("cells with nGene > 1000: \n",sum(allmeta$nGene > 1000),
           "\n\ncells with nGene >= 1000: \n",sum(allmeta$nGene >= 1000)))
## cells with nGene > 1000: 
## 585917
## 
## cells with nGene >= 1000: 
## 586460

this allmeta has information for all filtered nuclei, total 586,460 cells (nGene >= 1000)
sub-slusters in paper are marked under column 'Annotation'

head(allmeta)
##                       NAME Age   Annotation                     Dataset
## 2 H10_S46.GGAACTTCAAGAAGAG  35 Fibroblast_2 Human colon all cells (10X)
## 3 H10_S46.CCGTACTCACCTCGGA  35   Epithelial Human colon all cells (10X)
## 4 H10_S46.GATGCTAAGAGCAATT  35 Fibroblast_1 Human colon all cells (10X)
## 5 H10_S46.CCCAGTTAGGGATCTG  35    Myocyte_3 Human colon all cells (10X)
## 6 H10_S46.GGATTACCAGGATTGG  35    Myocyte_2 Human colon all cells (10X)
## 7 H10_S46.CAGAATCTCGCGGATC  35 Fibroblast_2 Human colon all cells (10X)
##   Location_ID Mouse_ID Overload Patient_ID    Region Segment Sex Time Type
## 2     Sigmoid     <NA>       1X        H10 Myenteric    <NA>   M <NA> <NA>
## 3     Sigmoid     <NA>       1X        H10 Myenteric    <NA>   M <NA> <NA>
## 4     Sigmoid     <NA>       1X        H10 Myenteric    <NA>   M <NA> <NA>
## 5     Sigmoid     <NA>       1X        H10 Myenteric    <NA>   M <NA> <NA>
## 6     Sigmoid     <NA>       1X        H10 Myenteric    <NA>   M <NA> <NA>
## 7     Sigmoid     <NA>       1X        H10 Myenteric    <NA>   M <NA> <NA>
##   Unique_ID nGene nUMI
## 2       S46  1166 1563
## 3       S46  1554 2061
## 4       S46  1335 1760
## 5       S46  1008 1367
## 6       S46  1066 1398
## 7       S46  1761 2437

column names of allmeta

colnames(allmeta)
##  [1] "NAME"        "Age"         "Annotation"  "Dataset"     "Location_ID"
##  [6] "Mouse_ID"    "Overload"    "Patient_ID"  "Region"      "Segment"    
## [11] "Sex"         "Time"        "Type"        "Unique_ID"   "nGene"      
## [16] "nUMI"

using 'Dataset' could get a stat for each dataset

data.frame(table(allmeta$Dataset))
##                                        Var1   Freq
## 1               Human colon all cells (10X) 146442
## 2            Human colon enteric glia (10X)   6054
## 3         Human colon enteric neurons (10X)   1445
## 4               Mouse colon all cells (10X) 343000
## 5            Mouse colon enteric glia (10X)   1690
## 6     Mouse colon enteric glia (Smart-Seq2)   3039
## 7         Mouse colon enteric neurons (10X)   1938
## 8  Mouse colon enteric neurons (Smart-Seq2)   2657
## 9               Mouse ileum all cells (10X)  79293
## 10           Mouse ileum enteric glia (10X)    429
## 11        Mouse ileum enteric neurons (10X)    473

ss2 subset

how about Smart-Seq2 datasets

# for this small amount of lines, could directly use
# data.frame(table(allmeta$Dataset))[c(6,8),]
data.frame(table(allmeta$Dataset))[grep("Smart-Seq2",as.character(data.frame(table(allmeta$Dataset))$Var1)), ]
##                                       Var1 Freq
## 6    Mouse colon enteric glia (Smart-Seq2) 3039
## 8 Mouse colon enteric neurons (Smart-Seq2) 2657

these '3039' glia and '2657' neurons of Mouse colon were first labeled in mice, then FACS sorted into plates,
so no 'all cells' as other droplet-datasets

subset allmeta to _ss2

allmeta_ss2 <- allmeta[allmeta$Dataset %in% c("Mouse colon enteric glia (Smart-Seq2)", "Mouse colon enteric neurons (Smart-Seq2)"),]

# continue to partition
allmeta_ss2_neur <- allmeta_ss2[allmeta_ss2$Dataset %in% c("Mouse colon enteric neurons (Smart-Seq2)"),]
allmeta_ss2_glia <- allmeta_ss2[allmeta_ss2$Dataset %in% c("Mouse colon enteric glia (Smart-Seq2)"),]

# check 
cat("dimention of allmeta_ss2: \n",dim(allmeta_ss2),
    "\n\ndimention of allmeta_ss2_neur: \n",dim(allmeta_ss2_neur),
    "\n\ndimention of allmeta_ss2_glia: \n",dim(allmeta_ss2_glia))
## dimention of allmeta_ss2: 
##  5696 16 
## 
## dimention of allmeta_ss2_neur: 
##  2657 16 
## 
## dimention of allmeta_ss2_glia: 
##  3039 16
head(allmeta_ss2)
##              NAME   Age Annotation                                  Dataset
## 572593 M1_S1.S339 Young     PIMN_4 Mouse colon enteric neurons (Smart-Seq2)
## 572594 M1_S1.S351 Young     PIMN_4 Mouse colon enteric neurons (Smart-Seq2)
## 572595 M1_S1.S363 Young     PIMN_4 Mouse colon enteric neurons (Smart-Seq2)
## 572596 M1_S1.S349 Young     PIMN_5 Mouse colon enteric neurons (Smart-Seq2)
## 572597 M1_S1.S313 Young     PIMN_4 Mouse colon enteric neurons (Smart-Seq2)
## 572598 M1_S1.S289 Young      PIN_1 Mouse colon enteric neurons (Smart-Seq2)
##        Location_ID Mouse_ID Overload Patient_ID Region Segment Sex Time  Type
## 572593        <NA>       M1     <NA>       <NA>   <NA>    <NA>   F <NA> Sox10
## 572594        <NA>       M1     <NA>       <NA>   <NA>    <NA>   F <NA> Sox10
## 572595        <NA>       M1     <NA>       <NA>   <NA>    <NA>   F <NA> Sox10
## 572596        <NA>       M1     <NA>       <NA>   <NA>    <NA>   F <NA> Sox10
## 572597        <NA>       M1     <NA>       <NA>   <NA>    <NA>   F <NA> Sox10
## 572598        <NA>       M1     <NA>       <NA>   <NA>    <NA>   F <NA> Sox10
##        Unique_ID nGene       nUMI
## 572593        S1  7060 1777551.53
## 572594        S1  7419 1435340.23
## 572595        S1  7674 1483155.55
## 572596        S1  8018  1505426.6
## 572597        S1  6985 1412616.17
## 572598        S1  7945 1210014.21
#tail(allmeta_ss2)
#head(allmeta_ss2_neur)
#head(allmeta_ss2_glia)

conditions recover

allmeta_ss2 is incomplete for all conditions, need to recover them from table mmc2.xlsx 'Mous Colon (plate)'

mmc2_ss2 <- read.table("../test_4_for_RAISIN/mmc2_1.csv",header=TRUE,sep = ",")
mmc2_ss2[1:10,]
##    Mouse_ID Sample_ID Age Age_Processed Model Model_Processed Segment
## 1        M1        S1  12         Young Sox10           Sox10     All
## 2        M2        S2  12         Young Sox10           Sox10     All
## 3        M3        S3  12         Young Sox10           Sox10     All
## 4        M4        S4  12         Young Sox10           Sox10     All
## 5        M5        S5  12         Young Sox10           Sox10     All
## 6        M6        S6  12         Young Sox10           Sox10     All
## 7        M7        S7  12         Young Sox10           Sox10       1
## 8        M7        S8  12         Young Sox10           Sox10       2
## 9        M7        S9  12         Young Sox10           Sox10       3
## 10       M7       S10  12         Young Sox10           Sox10       4
##    Segment_Processed Sex Sex_Processed Time Time_Processed Count_Neuron
## 1               <NA>   F             F <NA>           <NA>           10
## 2               <NA>   M             M <NA>           <NA>            4
## 3               <NA>   M             M <NA>           <NA>            4
## 4               <NA>   M             M <NA>           <NA>           23
## 5               <NA>   F             F  2PM           <NA>           14
## 6               <NA>   M             M  7AM            7AM           15
## 7           Proximal   F             F  7AM            7AM           17
## 8           Proximal   F             F  7AM            7AM           10
## 9             Distal   F             F  7AM            7AM            9
## 10            Distal   F             F  7AM            7AM           17
##    nGene_Neuron nUMI_Neuron Count_Glia nGene_Glia nUMI_Glia
## 1      7258.900   1365085.3         14   4827.071 1089717.3
## 2      7641.750   1177631.3         21   5194.619  952311.3
## 3      6442.750   1078223.6         16   5170.312 1067908.4
## 4      6966.565    905060.1         51   4833.804  709912.2
## 5      7773.357   1447690.7         54   4910.370  756925.7
## 6      7475.600   1111227.2         31   5111.355  943599.6
## 7      7585.529   1001655.7         50   5077.520  679572.9
## 8      7349.900    915390.5         32   5017.406  843839.6
## 9      7807.222   1201613.6         30   5156.567  842031.2
## 10     7602.647   1183935.2         35   5294.057  906308.8

depending on allmeta_ss2:Unique_ID = mmc_ss2:Sample_ID

# allmeta_ss2 just keep these columns, conditions all use mmc2's    
allmeta_ss2 <- allmeta_ss2[,c("NAME","Annotation","Dataset","Mouse_ID","Unique_ID","nGene","nUMI")]

# for same Sample_ID, copy mmc2 columns 3-12, i.e. Age-Time_Processed
for(i in 1:nrow(mmc2_ss2)){
  allmeta_ss2[allmeta_ss2$Unique_ID == mmc2_ss2$Sample_ID[i], colnames(mmc2_ss2)[3:12]] <- mmc2_ss2[i,3:12]
}

rownames(allmeta_ss2) <- allmeta_ss2$NAME

# repartition
allmeta_ss2_neur <- allmeta_ss2[allmeta_ss2$Dataset %in% c("Mouse colon enteric neurons (Smart-Seq2)"),]
allmeta_ss2_glia <- allmeta_ss2[allmeta_ss2$Dataset %in% c("Mouse colon enteric glia (Smart-Seq2)"),]

allmeta_ss2[1001:1005,]
##                      NAME Annotation                                  Dataset
## M19_S60.S304 M19_S60.S304     PIMN_1 Mouse colon enteric neurons (Smart-Seq2)
## M19_S60.S296 M19_S60.S296     PSVN_1 Mouse colon enteric neurons (Smart-Seq2)
## M19_S60.S313 M19_S60.S313     PIMN_1 Mouse colon enteric neurons (Smart-Seq2)
## M19_S60.S321 M19_S60.S321     PEMN_2 Mouse colon enteric neurons (Smart-Seq2)
## M19_S60.S318 M19_S60.S318      PIN_1 Mouse colon enteric neurons (Smart-Seq2)
##              Mouse_ID Unique_ID nGene       nUMI Age Age_Processed Model
## M19_S60.S304      M19       S60 11385 2034421.15  12         Young Uchl1
## M19_S60.S296      M19       S60  9780 2017089.15  12         Young Uchl1
## M19_S60.S313      M19       S60 10202 2249290.68  12         Young Uchl1
## M19_S60.S321      M19       S60 10170 2388613.29  12         Young Uchl1
## M19_S60.S318      M19       S60  9729 2264351.92  12         Young Uchl1
##              Model_Processed Segment Segment_Processed Sex Sex_Processed Time
## M19_S60.S304           Uchl1       4            Distal   M             M  7AM
## M19_S60.S296           Uchl1       4            Distal   M             M  7AM
## M19_S60.S313           Uchl1       4            Distal   M             M  7AM
## M19_S60.S321           Uchl1       4            Distal   M             M  7AM
## M19_S60.S318           Uchl1       4            Distal   M             M  7AM
##              Time_Processed
## M19_S60.S304            7AM
## M19_S60.S296            7AM
## M19_S60.S313            7AM
## M19_S60.S321            7AM
## M19_S60.S318            7AM
allmeta_ss2_neur[501:505,]
##                      NAME Annotation                                  Dataset
## M13_S36.S171 M13_S36.S171     PIMN_6 Mouse colon enteric neurons (Smart-Seq2)
## M13_S36.S124 M13_S36.S124     PIMN_6 Mouse colon enteric neurons (Smart-Seq2)
## M13_S36.S97   M13_S36.S97     PIMN_5 Mouse colon enteric neurons (Smart-Seq2)
## M13_S36.S164 M13_S36.S164      PSN_3 Mouse colon enteric neurons (Smart-Seq2)
## M13_S36.S142 M13_S36.S142     PEMN_3 Mouse colon enteric neurons (Smart-Seq2)
##              Mouse_ID Unique_ID nGene       nUMI Age Age_Processed Model
## M13_S36.S171      M13       S36  7868 1176891.69  12         Young Sox10
## M13_S36.S124      M13       S36  7279  658859.28  12         Young Sox10
## M13_S36.S97       M13       S36  8132 1621489.91  12         Young Sox10
## M13_S36.S164      M13       S36  6134  637731.59  12         Young Sox10
## M13_S36.S142      M13       S36  6827  999976.02  12         Young Sox10
##              Model_Processed Segment Segment_Processed Sex Sex_Processed Time
## M13_S36.S171           Sox10       2          Proximal   M             M  7PM
## M13_S36.S124           Sox10       2          Proximal   M             M  7PM
## M13_S36.S97            Sox10       2          Proximal   M             M  7PM
## M13_S36.S164           Sox10       2          Proximal   M             M  7PM
## M13_S36.S142           Sox10       2          Proximal   M             M  7PM
##              Time_Processed
## M13_S36.S171            7PM
## M13_S36.S124            7PM
## M13_S36.S97             7PM
## M13_S36.S164            7PM
## M13_S36.S142            7PM
allmeta_ss2_glia[1301:1305,]
##                      NAME Annotation                               Dataset
## M12_S32.S120 M12_S32.S120     Glia_1 Mouse colon enteric glia (Smart-Seq2)
## M12_S32.S109 M12_S32.S109     Glia_2 Mouse colon enteric glia (Smart-Seq2)
## M12_S32.S126 M12_S32.S126     Glia_1 Mouse colon enteric glia (Smart-Seq2)
## M12_S32.S138 M12_S32.S138     Glia_3 Mouse colon enteric glia (Smart-Seq2)
## M12_S32.S167 M12_S32.S167     Glia_1 Mouse colon enteric glia (Smart-Seq2)
##              Mouse_ID Unique_ID nGene      nUMI Age Age_Processed Model
## M12_S32.S120      M12       S32  3415 746971.26  12         Young Sox10
## M12_S32.S109      M12       S32  4856 951135.46  12         Young Sox10
## M12_S32.S126      M12       S32  4330 598552.18  12         Young Sox10
## M12_S32.S138      M12       S32  5014 637799.64  12         Young Sox10
## M12_S32.S167      M12       S32  4782 870248.93  12         Young Sox10
##              Model_Processed Segment Segment_Processed Sex Sex_Processed Time
## M12_S32.S120           Sox10       2          Proximal   M             M  7PM
## M12_S32.S109           Sox10       2          Proximal   M             M  7PM
## M12_S32.S126           Sox10       2          Proximal   M             M  7PM
## M12_S32.S138           Sox10       2          Proximal   M             M  7PM
## M12_S32.S167           Sox10       2          Proximal   M             M  7PM
##              Time_Processed
## M12_S32.S120            7PM
## M12_S32.S109            7PM
## M12_S32.S126            7PM
## M12_S32.S138            7PM
## M12_S32.S167            7PM

check the expression

load DGE matrix

ss2_neur.counts <- readMM(paste0(raw_dir,"DGE/gene_sorted-ss2.neur.matrix.mtx"))
rownames(ss2_neur.counts) <- readLines(paste0(raw_dir,"DGE/ss2.neur.genes.tsv"))
colnames(ss2_neur.counts) <- readLines(paste0(raw_dir,"DGE/ss2.neur.barcodes.tsv"))

ss2_glia.counts <- readMM(paste0(raw_dir,"DGE/gene_sorted-ss2.glia.matrix.mtx"))
rownames(ss2_glia.counts) <- readLines(paste0(raw_dir,"DGE/ss2.glia.genes.tsv"))
colnames(ss2_glia.counts) <- readLines(paste0(raw_dir,"DGE/ss2.glia.barcodes.tsv"))

check the expression

ss2_neur.counts[6:10,1500:1505]
## 5 x 6 sparse Matrix of class "dgTMatrix"
##               Atlas_M22_S71.S5 Atlas_M22_S71.S18 Atlas_M22_S71.S30
## 0610009B22Rik             .                 .                 .   
## 0610009D07Rik             .                 5.01              .   
## 0610009L18Rik             .                 .                 .   
## 0610009O20Rik             1.00             83.00              1.00
## 0610010B08Rik           328.18            153.61             30.85
##               Atlas_M22_S71.S7 Atlas_M22_S72.S161 Atlas_M22_S72.S139
## 0610009B22Rik             2.00                .                 .   
## 0610009D07Rik             1.00                .                 .   
## 0610009L18Rik             .                   .                 .   
## 0610009O20Rik            13.00              105.0               .   
## 0610010B08Rik           105.45               74.7              50.78
ss2_glia.counts[6:10,1400:1405]
## 5 x 6 sparse Matrix of class "dgTMatrix"
##               Atlas_M12_S34.S377 Atlas_M12_S34.S336 Atlas_M12_S34.S308
## 0610009B22Rik               .                  .                  .   
## 0610009D07Rik               .                  .                  1.57
## 0610009L18Rik               .                  .                  .   
## 0610009O20Rik               .                  .                  0.95
## 0610010B08Rik              10.49             399.33              12.43
##               Atlas_M12_S34.S314 Atlas_M12_S34.S315 Atlas_M12_S34.S290
## 0610009B22Rik               .                  .                 55.00
## 0610009D07Rik               .                  .                619.97
## 0610009L18Rik               .                  .                  .   
## 0610009O20Rik               .                  .                  .   
## 0610010B08Rik               2.51               4.99            1095.62

to be convenient, trying to unify NAME in allmeta and colnames of .counts, need to remove those 'Atlas_',
but get 'length not match' error, then find a few cells with 'Aging_' start instead of 'Atlas_',
and those cells are all from so-called 'old mice' with Age 52/104/105 weeks

check first 6 'Aging_' cells

ss2_neur.counts[6:10,grep("Aging_",colnames(ss2_neur.counts))[1:6]]
## 5 x 6 sparse Matrix of class "dgTMatrix"
##               Aging_M16_S45.S52 Aging_M16_S45.S14 Aging_M16_S45.S39
## 0610009B22Rik              1.00              .                 .   
## 0610009D07Rik              .                 .                 .   
## 0610009L18Rik              .                 .                 .   
## 0610009O20Rik              .                 .                 .   
## 0610010B08Rik            360.99            165.77            328.02
##               Aging_M16_S45.S83 Aging_M16_S45.S37 Aging_M16_S45.S63
## 0610009B22Rik              .                 .                 .   
## 0610009D07Rik              .                 .                 .   
## 0610009L18Rik              .                 .                 .   
## 0610009O20Rik              .                 .                 .   
## 0610010B08Rik            377.23             67.82             62.18
allmeta_ss2_neur[grep("Aging_",colnames(ss2_neur.counts))[1:6],c("Age","Age_Processed")]
##             Age Age_Processed
## M16_S45.S52  52           Old
## M16_S45.S14  52           Old
## M16_S45.S39  52           Old
## M16_S45.S83  52           Old
## M16_S45.S37  52           Old
## M16_S45.S63  52           Old

rename

remove 'Atlas_' or 'Aging_' in front of colnames of .counts
demo

colnames(ss2_neur.counts)[1:5];as.character(sapply(colnames(ss2_neur.counts)[1:5],function(x){gsub("Atlas_|Aging_","",x)}))
## [1] "Atlas_M1_S1.S339" "Atlas_M1_S1.S351" "Atlas_M1_S1.S363" "Atlas_M1_S1.S349"
## [5] "Atlas_M1_S1.S313"
## [1] "M1_S1.S339" "M1_S1.S351" "M1_S1.S363" "M1_S1.S349" "M1_S1.S313"

for all

colnames(ss2_neur.counts) <- as.character(sapply(colnames(ss2_neur.counts),function(x){gsub("Atlas_|Aging_","",x)}))
colnames(ss2_glia.counts) <- as.character(sapply(colnames(ss2_glia.counts),function(x){gsub("Atlas_|Aging_","",x)}))

check

cat("ss2_neur new cell names in allmeta_neur: \n",sum(colnames(ss2_neur.counts) %in% allmeta_ss2_neur$NAME),
    "\n\nss2_glia new cell names in allmeta_glia: \n",sum(colnames(ss2_glia.counts) %in% allmeta_ss2_glia$NAME))
## ss2_neur new cell names in allmeta_neur: 
##  2657 
## 
## ss2_glia new cell names in allmeta_glia: 
##  3039

actually they have same orders

cat("ss2_neur new cell names = NAME in allmeta_neur: \n",sum(colnames(ss2_neur.counts) == allmeta_ss2_neur$NAME),
    "\n\nss2_glia new cell names = NAME in allmeta_glia: \n",sum(colnames(ss2_glia.counts) == allmeta_ss2_glia$NAME))
## ss2_neur new cell names = NAME in allmeta_neur: 
##  2657 
## 
## ss2_glia new cell names = NAME in allmeta_glia: 
##  3039

TPM or counts ?

not sure about what's the expression value, TPM or counts? (though named by '.counts' ? the name/process is designed for droplet.)
considering it's RSEM's output which could have 'expected counts' with decimals (normal counts is integer), both could be possible,
and it should be processed for the little difference about nGene

nGene calculated from expression matrix is a little different from nGene in allmeta
guess some very small values of some genes became 0

cat("ss2_neur cells: \n",dim(ss2_neur.counts)[2],
    "\n\nss2_neur cells with 'nGene >=1000': \n",sum(colSums(ss2_neur.counts>0) >= 1000))
## ss2_neur cells: 
##  2657 
## 
## ss2_neur cells with 'nGene >=1000': 
##  2656
cat("ss2_glia cells: \n",dim(ss2_glia.counts)[2],
    "\n\nss2_glia cells with 'nGene >=1000': \n",sum(colSums(ss2_glia.counts>0) >= 1000))
## ss2_glia cells: 
##  3039 
## 
## ss2_glia cells with 'nGene >=1000': 
##  3036

and what's confused next is that there's a 'nUMI' column in allmeta_ss2
but as we know Smart-Seq2 technology is not so possible to contain UMI...

cat("total expression value of some ss2_neur cells: \n\n");colSums(ss2_neur.counts[,1500:1505])
## total expression value of some ss2_neur cells:
##   M22_S71.S5  M22_S71.S18  M22_S71.S30   M22_S71.S7 M22_S72.S161 M22_S72.S139 
##    1210587.6    1053291.2    1069731.4     616457.4    1412823.1    1106437.9

how about their 'nUMI' in meta data

allmeta_ss2_neur[1500:1505,"nUMI",drop=F]
##                    nUMI
## M22_S71.S5   1210587.56
## M22_S71.S18  1053291.16
## M22_S71.S30  1069731.41
## M22_S71.S7    616457.44
## M22_S72.S161 1412823.05
## M22_S72.S139  1106437.9
cat("total expression value of some ss2_glia cells: \n\n");colSums(ss2_glia.counts[,1400:1405])
## total expression value of some ss2_glia cells:
## M12_S34.S377 M12_S34.S336 M12_S34.S308 M12_S34.S314 M12_S34.S315 M12_S34.S290 
##     891648.1    1132662.9     588916.9     400811.4     781963.6     959742.6

how about their 'nUMI' in meta data

allmeta_ss2_glia[1400:1405,"nUMI",drop=F]
##                    nUMI
## M12_S34.S377  891648.05
## M12_S34.S336 1132662.92
## M12_S34.S308  588916.89
## M12_S34.S314  400811.42
## M12_S34.S315  781963.58
## M12_S34.S290  959742.57

CP10K but not TP10K

could see that so-called 'nUMI' is indeed sum of expression values, but not the same across samples

although in paper, ss2 always use TPM or TP10K,
here we could just regard the expression value as reads count

and without additional raw data, impossible to turn it into TPM,
so, we could just normalize it to CPM or say CP10k

# norm_f <- 1e6 
# have tried 1e6, it got much flatter and longer groups in tsne image after sub-clustering, no other considerations here  
norm_f <- 1e4
ss2_neur.counts <- sweep(norm_f*ss2_neur.counts,2,as.numeric(allmeta_ss2_neur$nUMI), FUN='/')
ss2_glia.counts <- sweep(norm_f*ss2_glia.counts,2,as.numeric(allmeta_ss2_glia$nUMI), FUN='/')

# _n as log2(CP10K+1) 
ss2_neur.counts_n <- log2(ss2_neur.counts+1)
ss2_glia.counts_n <- log2(ss2_glia.counts+1)

normalize total expression of each cell to 10k

cat("normalized total expression value of some ss2_neur cells: \n\n");colSums(ss2_neur.counts[,1500:1505])
## normalized total expression value of some ss2_neur cells:
##   M22_S71.S5  M22_S71.S18  M22_S71.S30   M22_S71.S7 M22_S72.S161 M22_S72.S139 
##        10000        10000        10000        10000        10000        10000
cat("normalized total expression value of some ss2_glia cells: \n\n");colSums(ss2_glia.counts[,1400:1405])
## normalized total expression value of some ss2_glia cells:
## M12_S34.S377 M12_S34.S336 M12_S34.S308 M12_S34.S314 M12_S34.S315 M12_S34.S290 
##        10000        10000        10000        10000        10000        10000

marker genes

check marker genes of neuron/glia from Lasrado et al. 2017

Neurons
Tubb3, Elavl4, Ret, Phox2b, Chrnb4, Eml5, Smpd3, Tagln3, Snap25, Gpr22, Gdap1l1, Stmn3,
Chrna3, Scg3, Syt4, Ncan, Crmp1, Adcyap1r1, Elavl3, Dlg2, Cacna2d(Cacna2d1/2/3/4 which ?)

Glia
Erbb3, Sox10, Fabp3, Plp1, Gas7, Nid1, Qk, Sparc, Mest, Nfia, Wwtr1, Gpm6b, Rasa3,
Flrt1, Itpripl1, Itga4, Lama4, Postn, Ptprz1, Pdpn, Col18a1, Nrcam

genes_neur <- c("Tubb3", "Elavl4", "Ret", "Phox2b", "Chrnb4", "Eml5", "Smpd3", 
                "Tagln3", "Snap25", "Gpr22", "Gdap1l1", "Stmn3", "Chrna3", "Scg3", 
                "Syt4", "Ncan", "Crmp1", "Adcyap1r1", "Elavl3", "Dlg2","Cacna2d1", "Cacna2d2")                
#               "Syt4", "Ncan", "Crmp1", "Adcyap1r1", "Elavl3", "Dlg2","Cacna2d1", "Cacna2d2", "Cacna2d3", "Cacna2d4")

genes_glia <- c("Erbb3", "Sox10", "Fabp3", "Plp1", "Gas7", "Nid1", "Qk", "Sparc", "Mest", "Nfia", "Wwtr1", 
                "Gpm6b", "Rasa3", "Flrt1", "Itpripl1", "Itga4", "Lama4", "Postn", "Ptprz1", "Pdpn", "Col18a1", "Nrcam")
pheatmap::pheatmap(cbind(ss2_neur.counts_n[c(genes_neur,genes_glia),],ss2_glia.counts_n[c(genes_neur,genes_glia),]),
         cluster_rows = F,cluster_cols = F, show_colnames = F, fontsize_row = 7.2,
         gaps_col = dim(ss2_neur.counts_n)[2], gaps_row = length(genes_neur),
         main = "Heatmap: log2(CP10K+1) expression of neur/glia marker genes\nleft: ss2_neur, right: ss2_glia;\n up: genes_neur, down: genes_glia")

# convert to signature score
sig_score_neur <- cbind(ss2_neur.counts_n[c(genes_neur),],ss2_glia.counts_n[c(genes_neur),])
sig_score_glia <- cbind(ss2_neur.counts_n[c(genes_glia),],ss2_glia.counts_n[c(genes_glia),])

sig_score_neur <- t(scale(t(sig_score_neur), center=FALSE))
sig_score_glia <- t(scale(t(sig_score_glia), center=FALSE))

scaled and mean

how about scaled value

pheatmap::pheatmap(rbind(sig_score_neur,sig_score_glia),
         cluster_rows = F,cluster_cols = F, show_colnames = F, fontsize_row = 7.2,
         gaps_col = dim(ss2_neur.counts_n)[2], gaps_row = length(genes_neur),
         main = "Heatmap: scaled log2(CP10K+1) expression of neur/glia marker genes\nleft: ss2_neur, right: ss2_glia;\n up: genes_neur, down: genes_glia")

# then take mean
sig_score_neur <- colMeans(sig_score_neur)
sig_score_glia <- colMeans(sig_score_glia)

pheatmap::pheatmap(rbind(sig_score_neur,sig_score_glia),
         cluster_rows = F,cluster_cols = F, show_colnames = F, 
         gaps_col = dim(ss2_neur.counts_n)[2], gaps_row = 1,
         main = "Heatmap: signature score of neur/glia marker genes\nleft: ss2_neur, right: ss2_glia;\n up: genes_neur, down: genes_glia")  

outliers

in general, ss2_neur and _glia are well separated by those marker genes
is there any cell different/opposite ?

cat("ss2_neur cells: \n",dim(ss2_neur.counts)[2],
"\n\nglia-like cells in ss2_neur: \n",sum(sig_score_neur[1:dim(ss2_neur.counts_n)[2]] < sig_score_glia[1:dim(ss2_neur.counts_n)[2]]),
"\n\n\nss2_glia cells: \n",dim(ss2_glia.counts)[2],
"\n\nneur-like cells in ss2_glia: \n",sum(sig_score_neur[(dim(ss2_neur.counts_n)[2]+1):(dim(ss2_neur.counts_n)[2]+dim(ss2_glia.counts_n)[2])] > 
      sig_score_glia[(dim(ss2_neur.counts_n)[2]+1):(dim(ss2_neur.counts_n)[2]+dim(ss2_glia.counts_n)[2])]))
## ss2_neur cells: 
##  2657 
## 
## glia-like cells in ss2_neur: 
##  0 
## 
## 
## ss2_glia cells: 
##  3039 
## 
## neur-like cells in ss2_glia: 
##  15

what are those cells

wrsig_cells <- rbind(sig_score_neur,sig_score_glia)[,c(sig_score_neur[1:dim(ss2_neur.counts_n)[2]] < sig_score_glia[1:dim(ss2_neur.counts_n)[2]],
                                                      sig_score_neur[(dim(ss2_neur.counts_n)[2]+1):(dim(ss2_neur.counts_n)[2]+dim(ss2_glia.counts_n)[2])] > 
                                                      sig_score_glia[(dim(ss2_neur.counts_n)[2]+1):(dim(ss2_neur.counts_n)[2]+dim(ss2_glia.counts_n)[2])])]
cat("neur-like ss2_glia :\n\n");t(wrsig_cells)
## neur-like ss2_glia :
##              sig_score_neur sig_score_glia
## M10_S25.S287      0.3956897     0.35361891
## M12_S31.S54       0.1935581     0.13908701
## M13_S35.S34       0.8062678     0.75337060
## M17_S52.S143      1.0703043     0.51475028
## M19_S59.S201      1.1289381     0.22542524
## M22_S67.S3        0.6895019     0.14439987
## M22_S67.S74       0.6765199     0.49102940
## M22_S68.S134      1.2611510     0.03127052
## M22_S73.S222      0.9624907     0.07234676
## M22_S73.S251      1.0419958     0.05229566
## M22_S73.S201      0.8180435     0.47197097
## M24_S79.S28       0.8825665     0.04041496
## M24_S80.S130      0.3680673     0.04114909
## M25_S83.S59       0.7483502     0.57859943
## M25_S85.S253      1.2196015     0.32383591
pheatmap::pheatmap(wrsig_cells,
         cluster_rows = F,cluster_cols = F, show_colnames = T, 
         display_numbers = T,
         main = "Heatmap: signature score of neur-like ss2_glia")  

pheatmap::pheatmap(cbind(ss2_neur.counts_n[c(genes_neur,genes_glia),],ss2_glia.counts_n[c(genes_neur,genes_glia),])[,colnames(wrsig_cells)],
         cluster_rows = F,cluster_cols = F, show_colnames = T, fontsize_row = 6.8,
         display_numbers = F, gaps_row = length(genes_neur),
         main = "Heatmap: log2(CP10K+1) of neur/glia marker genes for neur-like ss2_glia\n up: genes_neur, down: genes_glia")  

in 15 neur-like ss2_glia, could see about 6-7 have either very low or very close neur/glia signature score
what caused this may be i.Immune labeling, ii.FACS sorting or even iii.the List of marker genes for neur/glia

here would keep those cells, but make a mark for them

colnames(wrsig_cells)
##  [1] "M10_S25.S287" "M12_S31.S54"  "M13_S35.S34"  "M17_S52.S143" "M19_S59.S201"
##  [6] "M22_S67.S3"   "M22_S67.S74"  "M22_S68.S134" "M22_S73.S222" "M22_S73.S251"
## [11] "M22_S73.S201" "M24_S79.S28"  "M24_S80.S130" "M25_S83.S59"  "M25_S85.S253"

allmeta add neur/glia sig_score

# 
allmeta_ss2 <- cbind(allmeta_ss2,t(rbind(sig_score_neur,sig_score_glia))[rownames(allmeta_ss2),])
allmeta_ss2[,c("sig_score_neur","sig_score_glia")] <- round(allmeta_ss2[,c("sig_score_neur","sig_score_glia")],2)

# repartition neur/glia  
allmeta_ss2_neur <- allmeta_ss2[allmeta_ss2$Dataset %in% c("Mouse colon enteric neurons (Smart-Seq2)"),]
allmeta_ss2_glia <- allmeta_ss2[allmeta_ss2$Dataset %in% c("Mouse colon enteric glia (Smart-Seq2)"),]

check meta data of neur-like ss2_glia

allmeta_ss2[colnames(wrsig_cells),]
##                      NAME Annotation                               Dataset
## M10_S25.S287 M10_S25.S287     Glia_3 Mouse colon enteric glia (Smart-Seq2)
## M12_S31.S54   M12_S31.S54     Glia_2 Mouse colon enteric glia (Smart-Seq2)
## M13_S35.S34   M13_S35.S34     Glia_3 Mouse colon enteric glia (Smart-Seq2)
## M17_S52.S143 M17_S52.S143     Glia_2 Mouse colon enteric glia (Smart-Seq2)
## M19_S59.S201 M19_S59.S201     Glia_3 Mouse colon enteric glia (Smart-Seq2)
## M22_S67.S3     M22_S67.S3     Glia_2 Mouse colon enteric glia (Smart-Seq2)
## M22_S67.S74   M22_S67.S74     Glia_2 Mouse colon enteric glia (Smart-Seq2)
## M22_S68.S134 M22_S68.S134     Glia_2 Mouse colon enteric glia (Smart-Seq2)
## M22_S73.S222 M22_S73.S222     Glia_2 Mouse colon enteric glia (Smart-Seq2)
## M22_S73.S251 M22_S73.S251     Glia_2 Mouse colon enteric glia (Smart-Seq2)
## M22_S73.S201 M22_S73.S201     Glia_2 Mouse colon enteric glia (Smart-Seq2)
## M24_S79.S28   M24_S79.S28     Glia_2 Mouse colon enteric glia (Smart-Seq2)
## M24_S80.S130 M24_S80.S130     Glia_2 Mouse colon enteric glia (Smart-Seq2)
## M25_S83.S59   M25_S83.S59     Glia_2 Mouse colon enteric glia (Smart-Seq2)
## M25_S85.S253 M25_S85.S253     Glia_2 Mouse colon enteric glia (Smart-Seq2)
##              Mouse_ID Unique_ID nGene       nUMI Age Age_Processed Model
## M10_S25.S287      M10       S25  5199 1156647.53  12         Young Sox10
## M12_S31.S54       M12       S31  2805  241888.82  12         Young Sox10
## M13_S35.S34       M13       S35  6526 1643994.84  12         Young Sox10
## M17_S52.S143      M17       S52  8069  789865.28  12         Young Uchl1
## M19_S59.S201      M19       S59  4346   613243.6  12         Young Uchl1
## M22_S67.S3        M22       S67  2758   26048.76  12         Young Uchl1
## M22_S67.S74       M22       S67  5529  355083.05  12         Young Uchl1
## M22_S68.S134      M22       S68  4123  723618.93  12         Young Uchl1
## M22_S73.S222      M22       S73  4407  860896.83  12         Young Uchl1
## M22_S73.S251      M22       S73  5105  950884.65  12         Young Uchl1
## M22_S73.S201      M22       S73  6631  705714.29  12         Young Uchl1
## M24_S79.S28       M24       S79  5519  650119.54  12         Young Uchl1
## M24_S80.S130      M24       S80  5599  608444.38  12         Young Uchl1
## M25_S83.S59       M25       S83  7141  816792.66  11         Young Uchl1
## M25_S85.S253      M25       S85  7618 1158638.51  11         Young Uchl1
##              Model_Processed Segment Segment_Processed Sex Sex_Processed Time
## M10_S25.S287           Sox10       3            Distal   M             M  7PM
## M12_S31.S54            Sox10       1          Proximal   M             M  7PM
## M13_S35.S34            Sox10       1          Proximal   M             M  7PM
## M17_S52.S143           Uchl1       2          Proximal   M             M  7AM
## M19_S59.S201           Uchl1       1          Proximal   M             M  7AM
## M22_S67.S3             Uchl1       1          Proximal   M             M  7PM
## M22_S67.S74            Uchl1       1          Proximal   M             M  7PM
## M22_S68.S134           Uchl1       2          Proximal   M             M  7PM
## M22_S73.S222           Uchl1       3            Distal   M             M  7PM
## M22_S73.S251           Uchl1       3            Distal   M             M  7PM
## M22_S73.S201           Uchl1       3            Distal   M             M  7PM
## M24_S79.S28            Uchl1       1          Proximal   F             F  7PM
## M24_S80.S130           Uchl1       2          Proximal   F             F  7PM
## M25_S83.S59            Uchl1       1          Proximal   F             F  7AM
## M25_S85.S253           Uchl1       2          Proximal   F             F  7AM
##              Time_Processed sig_score_neur sig_score_glia
## M10_S25.S287            7PM           0.40           0.35
## M12_S31.S54             7PM           0.19           0.14
## M13_S35.S34             7PM           0.81           0.75
## M17_S52.S143            7AM           1.07           0.51
## M19_S59.S201            7AM           1.13           0.23
## M22_S67.S3              7PM           0.69           0.14
## M22_S67.S74             7PM           0.68           0.49
## M22_S68.S134            7PM           1.26           0.03
## M22_S73.S222            7PM           0.96           0.07
## M22_S73.S251            7PM           1.04           0.05
## M22_S73.S201            7PM           0.82           0.47
## M24_S79.S28             7PM           0.88           0.04
## M24_S80.S130            7PM           0.37           0.04
## M25_S83.S59             7AM           0.75           0.58
## M25_S85.S253            7AM           1.22           0.32
back to datasets

check nGene

(detected genes with expression)

plot(as.numeric(allmeta_ss2$nGene), type = "p", pch = 1,ylab="nGene")
abline(h=mean(as.numeric(allmeta_ss2$nGene)),col=6)
abline(h=mean(as.numeric(allmeta_ss2_neur$nGene)),col="green3")
abline(h=mean(as.numeric(allmeta_ss2_glia$nGene)),col="blue3")
title(paste0("mean nGene for ss2 neuron(left,green) and glia(right,blue) "))

neur and glia are naturally partitioned by nGene !
but couldn't do it this way with mixed neur+glia.

h <- hist(as.numeric(allmeta_ss2$nGene), breaks = 20,xlab="", main="", xlim = c(0,12000), ylim = c(0,800), plot = T)
h1 <- hist(as.numeric(allmeta_ss2_neur$nGene), breaks = 20, xlab="", main="", xlim = c(0,12000), ylim = c(0,800), plot = F)
h2 <- hist(as.numeric(allmeta_ss2_glia$nGene), breaks = 20, xlab="", main="", xlim = c(0,12000), ylim = c(0,800), plot = F)

d <- density(as.numeric(allmeta_ss2$nGene))
d1 <- density(as.numeric(allmeta_ss2_neur$nGene))
d2 <- density(as.numeric(allmeta_ss2_glia$nGene))

#plot(h,add=T, col="grey", plot = F)
#abline(v=mean(as.numeric(allmeta_ss2$nGene)),col=6)
title(paste0("nGene distribtusion for ss2 neuron(green) and glia(blue) "))
#lines(x = d$x, y=d$y* length(as.numeric(allmeta_ss2$nGene))* diff(h$breaks)[1],col="grey",lwd=2, xlim = c(0,12000), ylim = c(0,800))
lines(x = d1$x, y=d1$y* length(as.numeric(allmeta_ss2_neur$nGene))* diff(h1$breaks)[1],col="green3",lwd=3, xlim = c(0,12000), ylim = c(0,800))
lines(x = d2$x, y=d2$y* length(as.numeric(allmeta_ss2_glia$nGene))* diff(h2$breaks)[1],col="blue3",lwd=3, xlim = c(0,12000), ylim = c(0,800))

plot(h,add=F, col="grey", xlab="", main="")
title(paste0("nGene distribtusion for mixed ss2 neuron+glia "))
lines(x = d$x, y=d$y* length(as.numeric(allmeta_ss2$nGene))* diff(h$breaks)[1],col=6, lwd=2, xlim = c(0,12000), ylim = c(0,800))

clustering mixed ss2_neur/glia

variable genes are important in batch correction and clustering.
in paper, they calculate top 1500 vargenes for each batch, merge and sort by repeating times, take top 1500 again as new vargenes
also need to 'remove HLA, immunoglobulin, ribosomal, and mitochondrial genes based on HUGO gene names', with:
var_regex = '^HLA-|^IG[HJKL]|^RNA|^MT|^RP'

here comes two questions, the first one could have big effect on sub-clustering.

to partition ss2_neur and _glia, these could be not so important,
because differences between neur and glia are big enough to ignore other effects like batch effect;
but to sub-cluster ss2_neur or _glia, batch effect could be larger than putative biological effect,
without correction the result might be first clustered by batch effects then by biological ones in each batch.

  1. what does one batch look like ?

could be 'one Mouse' or 'one Sample' or some real batches to do experiment.
guess they might do several mice at once, but we could only try to select first two kinds without additional information.

using "Sample_ID" as batch info, more batches could have very small number of cells,
and would get mixed sub-clustering results comparing to inpaper clusters (allmeta$Annotation), especially ss2_glia.

finally choose the first kind, i.g. one unique 'Mouse_ID' as one batch

if one batch only has one cell, the combat correction step would use 'mean only' method instead of considering mean and variance of cells
if one batch only has very small number of cells, it might also have bad effect on this correction (of which I'm not so sure yet)
in the previous Cell paper of Chris's (example code), they merge all small batches with cells less than 50 as one new batch, but after trying a range of 'min_cells' to merge, I just set a relatively small value to merge as few mice as possible .

  1. HUGO name is for Human gene filtering, how about Mouse gene ?

just basically use the related gene names:
var_regex='^Cd|^Ig|^Rn|^mt|^Rp'

another set of genes may have effect is cell cycle related (not mentioned in paper),
have tested that. g2m genes enrich in 3-4 ss2_neur sub-clusters,
but without cellcycle genes, only one sub-cluster of PIMNs dispeared,
so here also do clustering without consideration of cellcycle's effect.

batch info

using 'Sample_ID' as batch info

cat("profiled ss2_mix in each Mouse-Sample: \n");table(allmeta_ss2$Unique_ID)[paste0("S",1:102)]
## profiled ss2_mix in each Mouse-Sample:
## 
##   S1   S2   S3   S4   S5   S6   S7   S8   S9  S10  S11  S12  S13  S14  S15  S16 
##   24   25   20   74   68   46   67   42   39   52   77   41   32   65   55   49 
##  S17  S18  S19  S20  S21  S22  S23  S24  S25  S26  S27  S28  S29  S30  S31  S32 
##   47   78   69   65   53   68   74   62   73   44   80   74   74   56   50   69 
##  S33  S34  S35  S36  S37  S38  S39  S40  S41  S42  S43  S44  S45  S46  S47  S48 
##   65   58   72   62   68   58    1    1    5    2    3    4   77   80   61   59 
##  S49  S50  S51  S52  S53  S54  S55  S56  S57  S58  S59  S60  S61  S62  S63  S64 
##   37   26   57   65   20   22   77   76   84   82   59   22   56   82   76   63 
##  S65  S66  S67  S68  S69  S70  S71  S72  S73  S74  S75  S76  S77  S78  S79  S80 
##   41   46   78   81   86   82   86   77   81   87   78   68   73   87   83   79 
##  S81  S82  S83  S84  S85  S86  S87  S88  S89  S90  S91  S92  S93  S94  S95  S96 
##   77   79   72   71   74   82   17   51   10    2   55   63   26   38   28   61 
##  S97  S98  S99 S100 S101 S102 
##   44   39   63   61   38   40
cat("\n quantile statistics: \n");quantile(table(allmeta_ss2$Unique_ID),probs=seq(0,1,0.05))
## 
##  quantile statistics:
##    0%    5%   10%   15%   20%   25%   30%   35%   40%   45%   50%   55%   60% 
##  1.00  4.05 20.20 26.00 38.00 41.00 46.00 51.35 56.00 59.00 62.00 65.00 68.00 
##   65%   70%   75%   80%   85%   90%   95%  100% 
## 70.30 73.70 75.50 77.00 78.85 81.00 82.95 87.00

actually one sample is more like one batch as samples have more average cell number,
another choice might could be to merge neighboring mouse/sample with small number of cells

using 'Mouse_ID' as batch info

cat("profiled ss2_mix in each Mouse: \n");table(allmeta_ss2$Mouse_ID)[paste0("M",1:29)]
## profiled ss2_mix in each Mouse:
## 
##  M1  M2  M3  M4  M5  M6  M7  M8  M9 M10 M11 M12 M13 M14 M15 M16 M17 M18 M19 M20 
##  24  25  20  74  68  46 415 229 255 253 284 242 260   2  14 340 164 319  81 138 
## M21 M22 M23 M24 M25 M26 M27 M28 M29 
## 226 658 306 318 299  80 182 172 202
cat("\n quantile statistics: \n");quantile(table(allmeta_ss2$Mouse_ID),probs=seq(0,1,0.05))
## 
##  quantile statistics:
##    0%    5%   10%   15%   20%   25%   30%   35%   40%   45%   50%   55%   60% 
##   2.0  16.4  23.2  29.2  59.2  74.0  80.4 126.6 165.6 178.0 202.0 227.2 239.4 
##   65%   70%   75%   80%   85%   90%   95%  100% 
## 253.4 258.0 284.0 301.8 315.6 323.2 385.0 658.0

check min_cells to choose to merge batches

min_cells <- 20
cat("ss2_mix mice: ",length(table(allmeta_ss2$Mouse_ID)),
    "\nss2_mix count: ",sum(table(allmeta_ss2$Mouse_ID)),
    paste0("\n\nmin_cell < ",min_cells," : ",
           "\n    mice ",sum(table(allmeta_ss2$Mouse_ID) < min_cells),
           "\n    count ",sum((table(allmeta_ss2$Mouse_ID))[table(allmeta_ss2$Mouse_ID) < min_cells])))
## ss2_mix mice:  29 
## ss2_mix count:  5696 
## 
## min_cell < 20 : 
##     mice 2
##     count 16

clustering

modified parameters for Phenograph in paper:
PCs 1-16, 'k'NN 250

#ss2_mix.counts <- RowMergeSparseMatrices(ss2_neur.counts, ss2_glia.counts)
#
batch_use.ss2_mix <- as.character(allmeta_ss2$Mouse_ID)
names(batch_use.ss2_mix) <- rownames(allmeta_ss2)
batch_use.ss2_mix <- factor(batch_use.ss2_mix)

#ss2_mix.seur <- run_analysis(name="ss2_mix",
#                             counts=ss2_mix.counts,
#                             do.batch=TRUE,
#                             batch.use=batch_use.ss2_mix,
#                             min_cells=20,
#                             var_regex='^Cd|^Ig|^Rn|^mt|^Rp',
#                             num_pcs=30,
#                             clust_pcs=16,
#                             k=250)

#
#ss2_mix.seur@meta.data <- cbind(ss2_mix.seur@meta.data, allmeta_ss2[rownames(ss2_mix.seur@meta.data),])
#saveRDS(ss2_mix.seur,"ss2_mix.seur.rds")

because this kind of step takes time and everytime to run would get a little different result,
here just use same parameters to run once then annotate the code, next always use the saved seurat object for analysis

ss2_mix.seur <- readRDS("ss2_mix.seur.rds")
head(ss2_mix.seur@meta.data)
##            orig.ident nCount_RNA nFeature_RNA phenograph.k250       NAME
## M1_S1.S339    ss2_mix  10000.000         7060               3 M1_S1.S339
## M1_S1.S351    ss2_mix  10000.000         7418               3 M1_S1.S351
## M1_S1.S363    ss2_mix   9999.865         7672               3 M1_S1.S363
## M1_S1.S349    ss2_mix   9999.957         8016               3 M1_S1.S349
## M1_S1.S313    ss2_mix   9999.972         6983               3 M1_S1.S313
## M1_S1.S289    ss2_mix  10000.000         7945               5 M1_S1.S289
##            Annotation                                  Dataset Mouse_ID
## M1_S1.S339     PIMN_4 Mouse colon enteric neurons (Smart-Seq2)       M1
## M1_S1.S351     PIMN_4 Mouse colon enteric neurons (Smart-Seq2)       M1
## M1_S1.S363     PIMN_4 Mouse colon enteric neurons (Smart-Seq2)       M1
## M1_S1.S349     PIMN_5 Mouse colon enteric neurons (Smart-Seq2)       M1
## M1_S1.S313     PIMN_4 Mouse colon enteric neurons (Smart-Seq2)       M1
## M1_S1.S289      PIN_1 Mouse colon enteric neurons (Smart-Seq2)       M1
##            Unique_ID nGene       nUMI Age Age_Processed Model Model_Processed
## M1_S1.S339        S1  7060 1777551.53  12         Young Sox10           Sox10
## M1_S1.S351        S1  7419 1435340.23  12         Young Sox10           Sox10
## M1_S1.S363        S1  7674 1483155.55  12         Young Sox10           Sox10
## M1_S1.S349        S1  8018  1505426.6  12         Young Sox10           Sox10
## M1_S1.S313        S1  6985 1412616.17  12         Young Sox10           Sox10
## M1_S1.S289        S1  7945 1210014.21  12         Young Sox10           Sox10
##            Segment Segment_Processed Sex Sex_Processed Time Time_Processed
## M1_S1.S339     All              <NA>   F             F <NA>           <NA>
## M1_S1.S351     All              <NA>   F             F <NA>           <NA>
## M1_S1.S363     All              <NA>   F             F <NA>           <NA>
## M1_S1.S349     All              <NA>   F             F <NA>           <NA>
## M1_S1.S313     All              <NA>   F             F <NA>           <NA>
## M1_S1.S289     All              <NA>   F             F <NA>           <NA>
##            sig_score_neur sig_score_glia  inlocal  inpaper
## M1_S1.S339           0.55           0.02 ss2_neur ss2_neur
## M1_S1.S351           0.99           0.07 ss2_neur ss2_neur
## M1_S1.S363           0.98           0.13 ss2_neur ss2_neur
## M1_S1.S349           0.97           0.04 ss2_neur ss2_neur
## M1_S1.S313           0.79           0.19 ss2_neur ss2_neur
## M1_S1.S289           0.88           0.03 ss2_neur ss2_neur

PCs

ElbowPlot(ss2_mix.seur,ndims = 30) + labs(title = "PCs range 1-30, take 1-16")

PC1&2

DimPlot(ss2_mix.seur, reduction = 'pca', dims = c(1,2)) + labs(title = "ss2_mix PC1+2 ")

DimPlot(ss2_mix.seur, reduction = 'pca', dims = c(1,2), group.by = 'Dataset') + labs(title = "ss2_mix PC1+2  with ss2_neur/glia labeling ")

tSNE

DimPlot(ss2_mix.seur, reduction = 'tsne') + labs(title = "ss2_mix tSNE")

DimPlot(ss2_mix.seur, reduction = 'tsne', group.by = 'Dataset' ) + labs(title = "ss2_mix tSNE  with ss2_neur/glia labeling")

Phenograph clusters

DimPlot(ss2_mix.seur, group.by = 'phenograph.k250', label = T) + labs(title = "ss2_mix Phenograph output")

neur/glia scores

check neur/glia sig_score of each cluster

pheno_clust <- ss2_mix.seur@meta.data[c("phenograph.k250","sig_score_neur","sig_score_glia","Dataset")]
pheno_clust <- group_by(pheno_clust, phenograph.k250)

pheno_clust <- summarise(pheno_clust,
                         count = n(),
                         neur_labeled = sum(Dataset=="Mouse colon enteric neurons (Smart-Seq2)"),
                         glia_labeled = sum(Dataset=="Mouse colon enteric glia (Smart-Seq2)"),
                         max_sig_neur = max(sig_score_neur),
                         max_sig_glia = max(sig_score_glia),
                         inlocal = if(max_sig_neur > max_sig_glia){"neuron"}else{"glia"})
data.frame(pheno_clust)
##   phenograph.k250 count neur_labeled glia_labeled max_sig_neur max_sig_glia
## 1               0  1721           23         1698         1.65         1.80
## 2               1   718            0          718         0.88         1.99
## 3               2   637           17          620         1.79         1.81
## 4               3   632          632            0         1.83         0.79
## 5               4   628          628            0         1.96         0.74
## 6               5   584          584            0         1.93         0.85
## 7               6   408          407            1         1.89         0.86
## 8               7   368          366            2         2.02         0.63
##   inlocal
## 1    glia
## 2    glia
## 3    glia
## 4  neuron
## 5  neuron
## 6  neuron
## 7  neuron
## 8  neuron

here we also get a few 'neur-like glia' or 'glia-like neur'
(I used to use "Sample_ID" as batch, do 'mean only' correction, and take top 1500 variable genes from whole dataset,
then only get 9 neur-like which all appear in 'marker genes' part upstairs different from labelings,
but these unmodified parameters failed in sub-clustering, so here just stay the same with next step)

('taking top 1500 vargenes from whole' and 'top 1500 from each batch, merging and taking sorted new top 1500 as vargenes' would have less than half genes overlap,
depending on 'Mouse or Sample to choose as one batch' and 'neur/glia or mixed')

# new batch info
var_c_mix <- as.factor(batch_use.ss2_mix)
levels(var_c_mix)[table(var_c_mix) < min_cells] <- "merge"

# top 1500 from whole once  
ss2_mix_var <- get_var_genes(ss2_mix.seur@assays[['RNA']]@data, samples=ss2_mix.seur$orig.ident, num_genes = 1500,min_cells = 20)

# top 1500 from each, merge, then new top 1500 from sorted by repeating times   
ss2_mix_var_c <- get_var_genes(ss2_mix.seur@assays[['RNA']]@data, samples = var_c_mix, num_genes = 1500,min_cells = 20)

cat("ss2_mix vargenes: 1500\noverlap of 'sorted merged from each batch' and 'from whole': ",
    sum(ss2_mix_var %in% ss2_mix_var_c),
    "\nspecific genes to exclude (^Cd|^Ig|^Rn|^mt|^Rp): ",length(grep('^Cd|^Ig|^Rn|^mt|^Rp', ss2_mix_var_c)))
## ss2_mix vargenes: 1500
## overlap of 'sorted merged from each batch' and 'from whole':  752 
## specific genes to exclude (^Cd|^Ig|^Rn|^mt|^Rp):  38
data.frame(pheno_clust)
##   phenograph.k250 count neur_labeled glia_labeled max_sig_neur max_sig_glia
## 1               0  1721           23         1698         1.65         1.80
## 2               1   718            0          718         0.88         1.99
## 3               2   637           17          620         1.79         1.81
## 4               3   632          632            0         1.83         0.79
## 5               4   628          628            0         1.96         0.74
## 6               5   584          584            0         1.93         0.85
## 7               6   408          407            1         1.89         0.86
## 8               7   368          366            2         2.02         0.63
##   inlocal
## 1    glia
## 2    glia
## 3    glia
## 4  neuron
## 5  neuron
## 6  neuron
## 7  neuron
## 8  neuron

could see pheno_clust '0' and '2' may be non-ideal to use max sig_score as a standard to decide glia/neuron,
though the final 'inlocal' cluster is 'right'
neur vs glia
1.65 vs 1.80
1.79 vs 1.81

outliers

check those cells different from neur/glia labelings
highest neur scores (1.65, 1.79) in glia are indeed from them

ss2_mix.seur@meta.data %>% filter(phenograph.k250 %in%  c("0","2") & Dataset == "Mouse colon enteric neurons (Smart-Seq2)")
##              orig.ident nCount_RNA nFeature_RNA phenograph.k250         NAME
## M17_S52.S162    ss2_mix  10000.000         7199               0 M17_S52.S162
## M17_S52.S189    ss2_mix   9999.986         4253               0 M17_S52.S189
## M19_S60.S294    ss2_mix   9999.819        11523               2 M19_S60.S294
## M22_S67.S64     ss2_mix  10000.000         5677               0  M22_S67.S64
## M22_S67.S68     ss2_mix   9999.966         5633               2  M22_S67.S68
## M22_S67.S72     ss2_mix   9999.848         7342               0  M22_S67.S72
## M22_S68.S184    ss2_mix  10000.000         8370               0 M22_S68.S184
## M22_S68.S113    ss2_mix  10000.000         8608               0 M22_S68.S113
## M22_S70.S314    ss2_mix   9999.981         4537               2 M22_S70.S314
## M22_S70.S351    ss2_mix  10000.000         3806               2 M22_S70.S351
## M22_S70.S308    ss2_mix   9999.839         5553               0 M22_S70.S308
## M22_S71.S74     ss2_mix   9999.975         5579               2  M22_S71.S74
## M22_S71.S25     ss2_mix   9993.599         6246               0  M22_S71.S25
## M22_S71.S63     ss2_mix   9999.894         7596               0  M22_S71.S63
## M22_S72.S164    ss2_mix  10000.000         5235               2 M22_S72.S164
## M22_S72.S145    ss2_mix  10000.000         2741               2 M22_S72.S145
## M22_S73.S257    ss2_mix   9999.764         5945               0 M22_S73.S257
## M22_S73.S193    ss2_mix  10000.000         4954               0 M22_S73.S193
## M22_S74.S306    ss2_mix  10000.000         3120               2 M22_S74.S306
## M24_S79.S74     ss2_mix  10000.000         6703               0  M24_S79.S74
## M24_S80.S181    ss2_mix  10000.000         6094               0 M24_S80.S181
## M24_S80.S136    ss2_mix  10000.000         6686               0 M24_S80.S136
## M24_S80.S125    ss2_mix  10000.000         5053               0 M24_S80.S125
## M24_S80.S106    ss2_mix   9999.140         6707               0 M24_S80.S106
## M24_S80.S105    ss2_mix  10000.000         3031               0 M24_S80.S105
## M24_S80.S103    ss2_mix  10000.000         4786               0 M24_S80.S103
## M24_S80.S172    ss2_mix   9999.773         6151               0 M24_S80.S172
## M24_S80.S113    ss2_mix   9999.984         5009               0 M24_S80.S113
## M24_S80.S151    ss2_mix  10000.000         7006               2 M24_S80.S151
## M24_S81.S211    ss2_mix   9999.964         5268               2 M24_S81.S211
## M24_S82.S380    ss2_mix  10000.000         5795               2 M24_S82.S380
## M24_S82.S298    ss2_mix  10000.000         7534               0 M24_S82.S298
## M24_S82.S305    ss2_mix  10000.000         4622               2 M24_S82.S305
## M24_S82.S329    ss2_mix  10000.000         6585               2 M24_S82.S329
## M25_S83.S51     ss2_mix   9986.817         7383               2  M25_S83.S51
## M25_S83.S1      ss2_mix   9999.443         7337               2   M25_S83.S1
## M25_S83.S6      ss2_mix   9999.978         7874               2   M25_S83.S6
## M25_S84.S163    ss2_mix  10000.000         4423               2 M25_S84.S163
## M25_S84.S97     ss2_mix   9999.288         4948               0  M25_S84.S97
## M25_S85.S255    ss2_mix   9999.120         9038               0 M25_S85.S255
##              Annotation                                  Dataset Mouse_ID
## M17_S52.S162     PSVN_1 Mouse colon enteric neurons (Smart-Seq2)      M17
## M17_S52.S189     PIMN_5 Mouse colon enteric neurons (Smart-Seq2)      M17
## M19_S60.S294     PSVN_1 Mouse colon enteric neurons (Smart-Seq2)      M19
## M22_S67.S64      PIMN_6 Mouse colon enteric neurons (Smart-Seq2)      M22
## M22_S67.S68      PSVN_2 Mouse colon enteric neurons (Smart-Seq2)      M22
## M22_S67.S72      PIMN_7 Mouse colon enteric neurons (Smart-Seq2)      M22
## M22_S68.S184     PIMN_7 Mouse colon enteric neurons (Smart-Seq2)      M22
## M22_S68.S113     PSVN_1 Mouse colon enteric neurons (Smart-Seq2)      M22
## M22_S70.S314     PSVN_1 Mouse colon enteric neurons (Smart-Seq2)      M22
## M22_S70.S351     PIMN_3 Mouse colon enteric neurons (Smart-Seq2)      M22
## M22_S70.S308     PIMN_5 Mouse colon enteric neurons (Smart-Seq2)      M22
## M22_S71.S74      PSVN_1 Mouse colon enteric neurons (Smart-Seq2)      M22
## M22_S71.S25      PSVN_1 Mouse colon enteric neurons (Smart-Seq2)      M22
## M22_S71.S63      PIMN_1 Mouse colon enteric neurons (Smart-Seq2)      M22
## M22_S72.S164     PIMN_1 Mouse colon enteric neurons (Smart-Seq2)      M22
## M22_S72.S145      PSN_1 Mouse colon enteric neurons (Smart-Seq2)      M22
## M22_S73.S257     PIMN_3 Mouse colon enteric neurons (Smart-Seq2)      M22
## M22_S73.S193     PIMN_4 Mouse colon enteric neurons (Smart-Seq2)      M22
## M22_S74.S306     PIMN_1 Mouse colon enteric neurons (Smart-Seq2)      M22
## M24_S79.S74      PSVN_1 Mouse colon enteric neurons (Smart-Seq2)      M24
## M24_S80.S181     PIMN_5 Mouse colon enteric neurons (Smart-Seq2)      M24
## M24_S80.S136     PIMN_5 Mouse colon enteric neurons (Smart-Seq2)      M24
## M24_S80.S125     PIMN_5 Mouse colon enteric neurons (Smart-Seq2)      M24
## M24_S80.S106     PIMN_6 Mouse colon enteric neurons (Smart-Seq2)      M24
## M24_S80.S105     PIMN_6 Mouse colon enteric neurons (Smart-Seq2)      M24
## M24_S80.S103     PIMN_5 Mouse colon enteric neurons (Smart-Seq2)      M24
## M24_S80.S172     PIMN_5 Mouse colon enteric neurons (Smart-Seq2)      M24
## M24_S80.S113     PIMN_2 Mouse colon enteric neurons (Smart-Seq2)      M24
## M24_S80.S151     PEMN_1 Mouse colon enteric neurons (Smart-Seq2)      M24
## M24_S81.S211     PSVN_1 Mouse colon enteric neurons (Smart-Seq2)      M24
## M24_S82.S380     PIMN_3 Mouse colon enteric neurons (Smart-Seq2)      M24
## M24_S82.S298     PIMN_3 Mouse colon enteric neurons (Smart-Seq2)      M24
## M24_S82.S305     PIMN_3 Mouse colon enteric neurons (Smart-Seq2)      M24
## M24_S82.S329     PIMN_1 Mouse colon enteric neurons (Smart-Seq2)      M24
## M25_S83.S51      PSVN_1 Mouse colon enteric neurons (Smart-Seq2)      M25
## M25_S83.S1       PSVN_1 Mouse colon enteric neurons (Smart-Seq2)      M25
## M25_S83.S6       PSVN_1 Mouse colon enteric neurons (Smart-Seq2)      M25
## M25_S84.S163      PSN_1 Mouse colon enteric neurons (Smart-Seq2)      M25
## M25_S84.S97      PSVN_1 Mouse colon enteric neurons (Smart-Seq2)      M25
## M25_S85.S255     PIMN_1 Mouse colon enteric neurons (Smart-Seq2)      M25
##              Unique_ID nGene       nUMI Age Age_Processed Model Model_Processed
## M17_S52.S162       S52  7199  758737.96  12         Young Uchl1           Uchl1
## M17_S52.S189       S52  4254   689184.7  12         Young Uchl1           Uchl1
## M19_S60.S294       S60 11526 1818436.95  12         Young Uchl1           Uchl1
## M22_S67.S64        S67  5677  274866.18  12         Young Uchl1           Uchl1
## M22_S67.S68        S67  5634  292735.49  12         Young Uchl1           Uchl1
## M22_S67.S72        S67  7343  263703.56  12         Young Uchl1           Uchl1
## M22_S68.S184       S68  8370  962725.77  12         Young Uchl1           Uchl1
## M22_S68.S113       S68  8608  693794.32  12         Young Uchl1           Uchl1
## M22_S70.S314       S70  4538  528050.12  12         Young Uchl1           Uchl1
## M22_S70.S351       S70  3806  342232.35  12         Young Uchl1           Uchl1
## M22_S70.S308       S70  5555   807938.5  12         Young Uchl1           Uchl1
## M22_S71.S74        S71  5580     643098  12         Young Uchl1           Uchl1
## M22_S71.S25        S71  6249 1010790.31  12         Young Uchl1           Uchl1
## M22_S71.S63        S71  7598   751938.2  12         Young Uchl1           Uchl1
## M22_S72.S164       S72  5236  954096.53  12         Young Uchl1           Uchl1
## M22_S72.S145       S72  2741  557430.18  12         Young Uchl1           Uchl1
## M22_S73.S257       S73  5947  591157.85  12         Young Uchl1           Uchl1
## M22_S73.S193       S73  4954  975702.49  12         Young Uchl1           Uchl1
## M22_S74.S306       S74  3120  295490.49  12         Young Uchl1           Uchl1
## M24_S79.S74        S79  6703  637414.81  12         Young Uchl1           Uchl1
## M24_S80.S181       S80  6094  570961.05  12         Young Uchl1           Uchl1
## M24_S80.S136       S80  6686  839451.89  12         Young Uchl1           Uchl1
## M24_S80.S125       S80  5053  507444.64  12         Young Uchl1           Uchl1
## M24_S80.S106       S80  6710  697658.82  12         Young Uchl1           Uchl1
## M24_S80.S105       S80  3031   26887.15  12         Young Uchl1           Uchl1
## M24_S80.S103       S80  4786   172529.5  12         Young Uchl1           Uchl1
## M24_S80.S172       S80  6153  703297.68  12         Young Uchl1           Uchl1
## M24_S80.S113       S80  5010  487671.81  12         Young Uchl1           Uchl1
## M24_S80.S151       S80  7006     791904  12         Young Uchl1           Uchl1
## M24_S81.S211       S81  5269  557690.12  12         Young Uchl1           Uchl1
## M24_S82.S380       S82  5795  786853.47  12         Young Uchl1           Uchl1
## M24_S82.S298       S82  7534  835317.43  12         Young Uchl1           Uchl1
## M24_S82.S305       S82  4622  438829.78  12         Young Uchl1           Uchl1
## M24_S82.S329       S82  6585  715351.04  12         Young Uchl1           Uchl1
## M25_S83.S51        S83  7390   921148.6  11         Young Uchl1           Uchl1
## M25_S83.S1         S83  7340  699579.13  11         Young Uchl1           Uchl1
## M25_S83.S6         S83  7875  908440.68  11         Young Uchl1           Uchl1
## M25_S84.S163       S84  4423  784851.73  12         Young Uchl1           Uchl1
## M25_S84.S97        S84  4951  393257.21  12         Young Uchl1           Uchl1
## M25_S85.S255       S85  9039  932142.48  11         Young Uchl1           Uchl1
##              Segment Segment_Processed Sex Sex_Processed Time Time_Processed
## M17_S52.S162       2          Proximal   M             M  7AM            7AM
## M17_S52.S189       2          Proximal   M             M  7AM            7AM
## M19_S60.S294       4            Distal   M             M  7AM            7AM
## M22_S67.S64        1          Proximal   M             M  7PM            7PM
## M22_S67.S68        1          Proximal   M             M  7PM            7PM
## M22_S67.S72        1          Proximal   M             M  7PM            7PM
## M22_S68.S184       2          Proximal   M             M  7PM            7PM
## M22_S68.S113       2          Proximal   M             M  7PM            7PM
## M22_S70.S314       4            Distal   M             M  7PM            7PM
## M22_S70.S351       4            Distal   M             M  7PM            7PM
## M22_S70.S308       4            Distal   M             M  7PM            7PM
## M22_S71.S74        1          Proximal   M             M  7PM            7PM
## M22_S71.S25        1          Proximal   M             M  7PM            7PM
## M22_S71.S63        1          Proximal   M             M  7PM            7PM
## M22_S72.S164       2          Proximal   M             M  7PM            7PM
## M22_S72.S145       2          Proximal   M             M  7PM            7PM
## M22_S73.S257       3            Distal   M             M  7PM            7PM
## M22_S73.S193       3            Distal   M             M  7PM            7PM
## M22_S74.S306       4            Distal   M             M  7PM            7PM
## M24_S79.S74        1          Proximal   F             F  7PM            7PM
## M24_S80.S181       2          Proximal   F             F  7PM            7PM
## M24_S80.S136       2          Proximal   F             F  7PM            7PM
## M24_S80.S125       2          Proximal   F             F  7PM            7PM
## M24_S80.S106       2          Proximal   F             F  7PM            7PM
## M24_S80.S105       2          Proximal   F             F  7PM            7PM
## M24_S80.S103       2          Proximal   F             F  7PM            7PM
## M24_S80.S172       2          Proximal   F             F  7PM            7PM
## M24_S80.S113       2          Proximal   F             F  7PM            7PM
## M24_S80.S151       2          Proximal   F             F  7PM            7PM
## M24_S81.S211       3            Distal   F             F  7PM            7PM
## M24_S82.S380       4            Distal   F             F  7PM            7PM
## M24_S82.S298       4            Distal   F             F  7PM            7PM
## M24_S82.S305       4            Distal   F             F  7PM            7PM
## M24_S82.S329       4            Distal   F             F  7PM            7PM
## M25_S83.S51        1          Proximal   F             F  7AM            7AM
## M25_S83.S1         1          Proximal   F             F  7AM            7AM
## M25_S83.S6         1          Proximal   F             F  7AM            7AM
## M25_S84.S163       2          Proximal   F             F  7AM            7AM
## M25_S84.S97        2          Proximal   F             F  7AM            7AM
## M25_S85.S255       2          Proximal   F             F  7AM            7AM
##              sig_score_neur sig_score_glia  inlocal  inpaper
## M17_S52.S162           0.76           0.27 ss2_glia ss2_neur
## M17_S52.S189           0.75           0.34 ss2_glia ss2_neur
## M19_S60.S294           1.79           0.22 ss2_glia ss2_neur
## M22_S67.S64            0.97           0.04 ss2_glia ss2_neur
## M22_S67.S68            0.69           0.09 ss2_glia ss2_neur
## M22_S67.S72            1.65           0.17 ss2_glia ss2_neur
## M22_S68.S184           1.30           0.17 ss2_glia ss2_neur
## M22_S68.S113           1.02           0.29 ss2_glia ss2_neur
## M22_S70.S314           1.01           0.14 ss2_glia ss2_neur
## M22_S70.S351           1.52           0.07 ss2_glia ss2_neur
## M22_S70.S308           0.84           0.02 ss2_glia ss2_neur
## M22_S71.S74            0.79           0.01 ss2_glia ss2_neur
## M22_S71.S25            0.74           0.20 ss2_glia ss2_neur
## M22_S71.S63            1.04           0.10 ss2_glia ss2_neur
## M22_S72.S164           1.33           0.27 ss2_glia ss2_neur
## M22_S72.S145           0.86           0.00 ss2_glia ss2_neur
## M22_S73.S257           1.25           0.04 ss2_glia ss2_neur
## M22_S73.S193           1.01           0.17 ss2_glia ss2_neur
## M22_S74.S306           1.11           0.00 ss2_glia ss2_neur
## M24_S79.S74            1.02           0.18 ss2_glia ss2_neur
## M24_S80.S181           0.61           0.17 ss2_glia ss2_neur
## M24_S80.S136           0.54           0.06 ss2_glia ss2_neur
## M24_S80.S125           0.91           0.03 ss2_glia ss2_neur
## M24_S80.S106           0.51           0.06 ss2_glia ss2_neur
## M24_S80.S105           0.85           0.16 ss2_glia ss2_neur
## M24_S80.S103           0.56           0.01 ss2_glia ss2_neur
## M24_S80.S172           0.64           0.09 ss2_glia ss2_neur
## M24_S80.S113           0.89           0.27 ss2_glia ss2_neur
## M24_S80.S151           0.80           0.03 ss2_glia ss2_neur
## M24_S81.S211           0.38           0.04 ss2_glia ss2_neur
## M24_S82.S380           1.63           0.11 ss2_glia ss2_neur
## M24_S82.S298           1.64           0.14 ss2_glia ss2_neur
## M24_S82.S305           1.26           0.03 ss2_glia ss2_neur
## M24_S82.S329           1.75           0.24 ss2_glia ss2_neur
## M25_S83.S51            0.89           0.17 ss2_glia ss2_neur
## M25_S83.S1             0.92           0.21 ss2_glia ss2_neur
## M25_S83.S6             1.16           0.13 ss2_glia ss2_neur
## M25_S84.S163           1.40           0.11 ss2_glia ss2_neur
## M25_S84.S97            0.82           0.07 ss2_glia ss2_neur
## M25_S85.S255           1.51           0.21 ss2_glia ss2_neur
ss2_mix.seur@meta.data %>% filter(phenograph.k250 %in%  c("6","7") & Dataset == "Mouse colon enteric glia (Smart-Seq2)")
##              orig.ident nCount_RNA nFeature_RNA phenograph.k250         NAME
## M8_S16.S132     ss2_mix  10000.000         6315               6  M8_S16.S132
## M22_S73.S222    ss2_mix   9999.988         4406               7 M22_S73.S222
## M28_S96.S169    ss2_mix  10000.000         5196               7 M28_S96.S169
##              Annotation                               Dataset Mouse_ID
## M8_S16.S132      Glia_2 Mouse colon enteric glia (Smart-Seq2)       M8
## M22_S73.S222     Glia_2 Mouse colon enteric glia (Smart-Seq2)      M22
## M28_S96.S169     Glia_2 Mouse colon enteric glia (Smart-Seq2)      M28
##              Unique_ID nGene      nUMI Age Age_Processed Model Model_Processed
## M8_S16.S132        S16  6315 1038469.5  12         Young Sox10           Sox10
## M22_S73.S222       S73  4407 860896.83  12         Young Uchl1           Uchl1
## M28_S96.S169       S96  5196 732008.35 105           Old Sox10           Sox10
##              Segment Segment_Processed Sex Sex_Processed Time Time_Processed
## M8_S16.S132        2          Proximal   F             F  7PM            7PM
## M22_S73.S222       3            Distal   M             M  7PM            7PM
## M28_S96.S169       2          Proximal   M             M  7AM            7AM
##              sig_score_neur sig_score_glia  inlocal  inpaper
## M8_S16.S132            0.28           0.73 ss2_neur ss2_glia
## M22_S73.S222           0.96           0.07 ss2_neur ss2_glia
## M28_S96.S169           0.31           0.63 ss2_neur ss2_glia

except a small part have low or close neur/glia score, not sure about why others get into a cluster which is different from its labeling.
here just make a mark for them, sub-clustering would still use initially labeled ss2_neur/glia as input instead of this one.

result and check

inlocal final result for mixed ss2_neur/glia

ss2_mix.seur@meta.data$inlocal <- ss2_mix.seur@meta.data$phenograph.k250
for(i in 1:length(ss2_mix.seur@meta.data$inlocal)){
  if(ss2_mix.seur@meta.data$inlocal[i] %in% c("0","1","2")){
    ss2_mix.seur@meta.data$inlocal[i] <- "ss2_glia"
  }else{
      ss2_mix.seur@meta.data$inlocal[i] <- "ss2_neur"
    }
}

DimPlot(ss2_mix.seur, group.by = 'inlocal') + labs(title = "ss2_mix inlocal")

inpaper's

ss2_mix.seur@meta.data$inpaper <- ss2_mix.seur@meta.data$Dataset
for(i in 1:length(ss2_mix.seur@meta.data$inpaper)){
  if(ss2_mix.seur@meta.data$inpaper[i] %in% c("Mouse colon enteric glia (Smart-Seq2)")){
    ss2_mix.seur@meta.data$inpaper[i] <- "ss2_glia"
  }else{
      ss2_mix.seur@meta.data$inpaper[i] <- "ss2_neur"
    }
}

DimPlot(ss2_mix.seur, group.by = 'inpaper') + labs(title = "ss2_mix inpaper")

check main division marker genes of neur
three pairs, all devide neur into two parts,
could see Chat-Nos1 works well for ss2_neur uniquely, the other two pairs have more or less expression on glia

FeaturePlot(ss2_mix.seur, features = c("Chat","Nos1","Gfra2","Gfra1","Casz1","Etv1"))

check given marker genes of glia,
could see these three genes faintly devide glia into two or three parts from top-right to down-left,
but not so clear comparing to neur

FeaturePlot(ss2_mix.seur, features = c("Gfra2","Slc18a2","Ntsr1"))

#saveRDS(ss2_mix.seur,"ss2_mix.seur.rds")
#rm(ss2_mix.seur,ss2_mix.counts)

subclustering

batch info

check how many min_cells to choose to merge batch

cat("profiled ss2_neur in each Mouse: \n");table(allmeta_ss2_neur$Mouse_ID)[paste0("M",1:29)]
## profiled ss2_neur in each Mouse:
## 
##  M1  M2  M3  M4  M5  M6  M7  M8  M9 M10 M11 M12 M13 M14 M15 M16 M17 M18 M19 M20 
##  10   4   4  23  14  15 103  56  71  60  76  56  71   1   9  73 162 119  80  26 
## M21 M22 M23 M24 M25 M26 M27 M28 M29 
##  61 651 104 316 296  21  51  66  58
cat("\n quantile statistics: \n");quantile(table(allmeta_ss2_neur$Mouse_ID),probs=seq(0,1,0.05))
## 
##  quantile statistics:
##    0%    5%   10%   15%   20%   25%   30%   35%   40%   45%   50%   55%   60% 
##   1.0   4.0   8.0  10.8  14.6  21.0  24.2  46.0  56.0  57.2  60.0  63.0  70.0 
##   65%   70%   75%   80%   85%   90%   95%  100% 
##  71.4  74.8  80.0 103.4 116.0 188.8 308.0 651.0
min_cells <- 5
cat("ss2_neur mice: ",length(table(allmeta_ss2_neur$Mouse_ID)),
    "\nss2_neur count: ",sum(table(allmeta_ss2_neur$Mouse_ID)),
    paste0("\nmin_cell < ",min_cells," : "),
    "\n    mice ",sum(table(allmeta_ss2_neur$Mouse_ID) < min_cells),
    "\n    count ",sum((table(allmeta_ss2_neur$Mouse_ID))[table(allmeta_ss2_neur$Mouse_ID) < min_cells]))
## ss2_neur mice:  29 
## ss2_neur count:  2657 
## min_cell < 5 :  
##     mice  3 
##     count  9
cat("profiled ss2_glia in each Mouse: \n");table(allmeta_ss2_glia$Mouse_ID)[paste0("M",1:29)]
## profiled ss2_glia in each Mouse:
## 
##  M1  M2  M3  M4  M5  M6  M7  M8  M9 M10 M11 M12 M13 M14 M15 M16 M17 M18 M19 M20 
##  14  21  16  51  54  31 312 173 184 193 208 186 189   1   5 267   2 200   1 112 
## M21 M22 M23 M24 M25 M26 M27 M28 M29 
## 165   7 202   2   3  59 131 106 144
cat("\n quantile statistics: \n");quantile(table(allmeta_ss2_glia$Mouse_ID),probs=seq(0,1,0.05))
## 
##  quantile statistics:
##    0%    5%   10%   15%   20%   25%   30%   35%   40%   45%   50%   55%   60% 
##   1.0   1.4   2.0   3.4   6.2  14.0  18.0  29.0  51.6  57.0 106.0 119.6 141.4 
##   65%   70%   75%   80%   85%   90%   95%  100% 
## 166.6 179.6 186.0 190.6 198.6 203.2 243.4 312.0
min_cells <- 5
cat("ss2_glia mice: ",length(table(allmeta_ss2_glia$Mouse_ID)),
    "\nss2_glia count: ",sum(table(allmeta_ss2_glia$Mouse_ID)),
    paste0("\nmin_cell < ",min_cells," : "),
    "\n    mice ",sum(table(allmeta_ss2_glia$Mouse_ID) < min_cells),
    "\n    count ",sum((table(allmeta_ss2_glia$Mouse_ID))[table(allmeta_ss2_glia$Mouse_ID) < min_cells]))
## ss2_glia mice:  29 
## ss2_glia count:  3039 
## min_cell < 5 :  
##     mice  5 
##     count  9

subclustering

modified parameters for Phenograph in paper:
ss2_neur PCs 30 1-15, 'k'NN 15
ss2_glia PCs 15 1-9, 'k'NN 500

# ss2_neur
batch_use.ss2_neur <- as.character(allmeta_ss2_neur$Mouse_ID)
names(batch_use.ss2_neur) <- rownames(allmeta_ss2_neur)
batch_use.ss2_neur <- factor(batch_use.ss2_neur)

#ss2_neur.seur <- run_analysis(name="ss2_neur",
#                             counts=ss2_neur.counts,
#                             do.batch=TRUE,
#                             batch.use=batch_use.ss2_neur,
#                             min_cells=5,
#                             var_regex='^Cd|^Ig|^Rn|^mt|^Rp',
#                             num_pcs=30,
#                             clust_pcs=15,
#                             k=15)

#ss2_neur.seur@meta.data <- cbind(ss2_neur.seur@meta.data, allmeta_ss2_neur[rownames(ss2_neur.seur@meta.data),])


# ss2_glia
batch_use.ss2_glia <- as.character(allmeta_ss2_glia$Mouse_ID)
names(batch_use.ss2_glia) <- rownames(allmeta_ss2_glia)
batch_use.ss2_glia <- factor(batch_use.ss2_glia)

#ss2_glia.seur <- run_analysis(name="ss2_glia",
#                             counts=ss2_glia.counts,
#                             do.batch=TRUE,
#                             batch.use=batch_use.ss2_glia,
#                             min_cells=5,
#                             var_regex='^Cd|^Ig|^Rn|^mt|^Rp',
#                             num_pcs=15,
#                             clust_pcs=9,
#                             k=500)

#ss2_glia.seur@meta.data <- cbind(ss2_glia.seur@meta.data, allmeta_ss2_glia[rownames(ss2_glia.seur@meta.data),])
#saveRDS(ss2_neur.seur,"ss2_neur.seur.rds")
#saveRDS(ss2_glia.seur,"ss2_glia.seur.rds")

because this kind of step takes time and everytime to run would get a little different result,
here just use same parameters to run once then annotate the code, next always use the saved seurat object for analysis

#head(ss2_neur.seur@meta.data)
#head(ss2_glia.seur@meta.data)

here's a small bug for ss2_glia I have never solved after running this for so many times,
ss2_glia.seur has an orig.ident and active.ident named by "Mouse_Sample" from its sample names instead of specified 'name' in the script,
but with the same code, ss2_neur.seur's org.ident and active.ident are always the value of 'name',
this only affects the plotting, guess there's sth. different between ss2_neur/glia mtx files, just fix it

#ss2_glia.seur@active.ident <- ss2_glia.seur@meta.data$orig.ident <- factor("ss2_glia")
#saveRDS(ss2_neur.seur,"ss2_neur.seur.rds")
#saveRDS(ss2_glia.seur,"ss2_glia.seur.rds")
ss2_neur.seur <- readRDS("ss2_neur.seur.rds")
ss2_glia.seur <- readRDS("ss2_glia.seur.rds")

var genes

check variable genes to use

# new batch info
var_c_neur <- as.factor(batch_use.ss2_neur)
levels(var_c_neur)[table(var_c_neur) < min_cells] <- "merge"

# top 1500 from whole once  
ss2_neur_var <- get_var_genes(ss2_neur.seur@assays[['RNA']]@data, samples=ss2_neur.seur$orig.ident, num_genes = 1500,min_cells = 5)

# top 1500 from each, merge, then new top 1500 from sorted by repeating times   
ss2_neur_var_c <- get_var_genes(ss2_neur.seur@assays[['RNA']]@data, samples = var_c_neur, num_genes = 1500,min_cells = 5)

cat("ss2_neur vargenes: 1500\noverlap of 'sorted merged from each batch' and 'from whole': ",
    sum(ss2_neur_var %in% ss2_neur_var_c),
    "\nspecific genes to exclude (^Cd|^Ig|^Rn|^mt|^Rp): ",length(grep('^Cd|^Ig|^Rn|^mt|^Rp', ss2_neur_var_c)))
## ss2_neur vargenes: 1500
## overlap of 'sorted merged from each batch' and 'from whole':  887 
## specific genes to exclude (^Cd|^Ig|^Rn|^mt|^Rp):  32
# new batch info
var_c_glia <- as.factor(batch_use.ss2_glia)
levels(var_c_glia)[table(var_c_glia) < min_cells] <- "merge"

# top 1500 from whole once  
ss2_glia_var <- get_var_genes(ss2_glia.seur@assays[['RNA']]@data, samples=ss2_glia.seur$orig.ident, num_genes = 1500,min_cells = 5)

# top 1500 from each, merge, then new top 1500 from sorted by repeating times   
ss2_glia_var_c <- get_var_genes(ss2_glia.seur@assays[['RNA']]@data, samples = var_c_glia, num_genes = 1500,min_cells = 5)

cat("ss2_glia vargenes: 1500\noverlap of 'sorted merged from each batch' and 'from whole': ",
    sum(ss2_glia_var %in% ss2_glia_var_c),
    "\nspecific genes to exclude (^Cd|^Ig|^Rn|^mt|^Rp): ",length(grep('^Cd|^Ig|^Rn|^mt|^Rp', ss2_glia_var_c)))
## ss2_glia vargenes: 1500
## overlap of 'sorted merged from each batch' and 'from whole':  811 
## specific genes to exclude (^Cd|^Ig|^Rn|^mt|^Rp):  31

here comes a question I haven't explored the answers.
who not directly use variable genes or even corrected batches from ss2_mix's ?
here just restart those steps for sub-clustering.

ss2_neur

check results

PCs

ElbowPlot(ss2_neur.seur,ndims = 30) + labs(title = "ss2_neur PCs, take 1-15")

PC1&2

DimPlot(ss2_neur.seur, reduction = 'pca', dims = c(1,2)) + labs(title = "ss2_neur PC1+2 ")

DimPlot(ss2_neur.seur, reduction = 'pca', dims = c(1,2), group.by = 'Annotation') + labs(title = "ss2_neur PC1+2  with inpaper annotations")

tSNE

DimPlot(ss2_neur.seur, reduction = 'tsne') + labs(title = "ss2_neur tSNE")

DimPlot(ss2_neur.seur, reduction = 'tsne', group.by = 'Annotation',label = T ) + labs(title = "ss2_neur tSNE  with  inpaper annotations")

Phenograph clusters

DimPlot(ss2_neur.seur, group.by = 'phenograph.k15', label = T) + labs(title = "ss2_neur Phenograph output")

annotation

could see ss2_neur get 21 (inpaper) + 4 more (inlocal) = 25 sub-clusters.
how to really annotate them need to combine considerations of 'modification of clustering parameters' and 'marker genes of each sub-cluster relating to biological background',
it shouldn't be an easy part, but here we just do it in an easy way to annotate each cluster for a similar result like inpaper's

pheno_neur <- ss2_neur.seur@meta.data[c('phenograph.k15','Annotation')]
pheno_neur <- group_by(pheno_neur, phenograph.k15)

pheno_neur <- summarise(pheno_neur,
                         count=n(),
                         PEMN_1_inpaper = sum(Annotation == 'PEMN_1'),
                         PEMN_2_inpaper = sum(Annotation == 'PEMN_2'),
                         PEMN_3_inpaper = sum(Annotation == 'PEMN_3'),
                         PEMN_4_inpaper = sum(Annotation == 'PEMN_4'),
                         PEMN_5_inpaper = sum(Annotation == 'PEMN_5'),
                         PIMN_1_inpaper = sum(Annotation == 'PIMN_1'),
                         PIMN_2_inpaper = sum(Annotation == 'PIMN_2'),
                         PIMN_3_inpaper = sum(Annotation == 'PIMN_3'),
                         PIMN_4_inpaper = sum(Annotation == 'PIMN_4'),
                         PIMN_5_inpaper = sum(Annotation == 'PIMN_5'),
                         PIMN_6_inpaper = sum(Annotation == 'PIMN_6'),
                         PIMN_7_inpaper = sum(Annotation == 'PIMN_7'),
                         PIN_1_inpaper = sum(Annotation == 'PIN_1'),
                         PIN_2_inpaper = sum(Annotation == 'PIN_2'),
                         PIN_3_inpaper = sum(Annotation == 'PIN_3'),
                         PSN_1_inpaper = sum(Annotation == 'PSN_1'),
                         PSN_2_inpaper = sum(Annotation == 'PSN_2'),
                         PSN_3_inpaper = sum(Annotation == 'PSN_3'),
                         PSN_4_inpaper = sum(Annotation == 'PSN_4'),
                         PSVN_1_inpaper = sum(Annotation == 'PSVN_1'),
                         PSVN_2_inpaper = sum(Annotation == 'PSVN_2'))
  
#pheno_neur
pheno_neur <- rbind(pheno_neur,c("count",colSums(pheno_neur[,2:ncol(pheno_neur)])))
pheno_neur <- data.frame(t(pheno_neur))[c(1,3:23,2),][,c(order(as.numeric(pheno_neur$phenograph.k15[1:25])),26)]
colnames(pheno_neur) <- c(paste0("pheno_",pheno_neur[1,1:ncol(pheno_neur)-1]),"count")
pheno_neur <- pheno_neur[2:nrow(pheno_neur),]
pheno_neur
##                pheno_0 pheno_1 pheno_2 pheno_3 pheno_4 pheno_5 pheno_6 pheno_7
## PEMN_1_inpaper       0       0       0       0       0       0       0     101
## PEMN_2_inpaper       0       0       0       0       0       0       1      21
## PEMN_3_inpaper       0     191       0       1       0       1      21       1
## PEMN_4_inpaper       0      12       0       0       0       0      45       0
## PEMN_5_inpaper       0       7       0       0       0       0      57       0
## PIMN_1_inpaper       0       0       0       0       3      33       0       0
## PIMN_2_inpaper      21       0       0       0     118      38       0       0
## PIMN_3_inpaper       0       0       0       0       6      65       0       0
## PIMN_4_inpaper     198       0      23       0      53       1       0       0
## PIMN_5_inpaper      13       0     168       5       5       0       0       0
## PIMN_6_inpaper       0       0       7     189       6       1       0       0
## PIMN_7_inpaper       1       0       0       1       0       0       0       0
## PIN_1_inpaper        0       0       0       0       0       0       0       0
## PIN_2_inpaper        0       0       0       0       0       0       0       0
## PIN_3_inpaper        0       0       0       0       0       0       0       0
## PSN_1_inpaper        0       0       0       0       0       0       0       0
## PSN_2_inpaper        0       0       0       0       0       0       0       0
## PSN_3_inpaper        0       0       0       0       0       0       0       0
## PSN_4_inpaper        0      20       0       0       0       0       2       0
## PSVN_1_inpaper       1       0       0       0       0       6       0       0
## PSVN_2_inpaper       0       0       1       0       0       0       0       0
## count              234     230     199     196     191     145     126     123
##                pheno_8 pheno_9 pheno_10 pheno_11 pheno_12 pheno_13 pheno_14
## PEMN_1_inpaper       0       0        0        0        4        1        0
## PEMN_2_inpaper       0       0        0        0       47        0        0
## PEMN_3_inpaper       0       0        0        0        0        0        0
## PEMN_4_inpaper       0       0        0        0       18        0        0
## PEMN_5_inpaper       0       0        0        2       19        0        0
## PIMN_1_inpaper       0       0       37        0        0        0        1
## PIMN_2_inpaper       0       0        0        0        0        0        0
## PIMN_3_inpaper       0       0        0        0        0        0        0
## PIMN_4_inpaper       0       0        0        0        0        0        0
## PIMN_5_inpaper       0       0        3        0        0        0        0
## PIMN_6_inpaper       0       0       58        0        0        0        0
## PIMN_7_inpaper       0       0        1        0        0        0        0
## PIN_1_inpaper       68       0        0        0        0        1       64
## PIN_2_inpaper        0       0        0        0        0       27        0
## PIN_3_inpaper        0       0        0        0        0       58        0
## PSN_1_inpaper        0       0        0        2        0        1        0
## PSN_2_inpaper        0       0        0        0        0        0        0
## PSN_3_inpaper       38       0        0        0        0        0       18
## PSN_4_inpaper        0       0        0       92        0        0        0
## PSVN_1_inpaper       0      19        0        0        0        0        0
## PSVN_2_inpaper       0      80        0        0        0        0        0
## count              106      99       99       96       88       88       83
##                pheno_15 pheno_16 pheno_17 pheno_18 pheno_19 pheno_20 pheno_21
## PEMN_1_inpaper        0       22        0        0        0        0        0
## PEMN_2_inpaper        0        5        0        0        0        3        0
## PEMN_3_inpaper        7        1        0        2        0       18        0
## PEMN_4_inpaper        0       49        0        0        0        3        0
## PEMN_5_inpaper        0        4        0        0        0       30        0
## PIMN_1_inpaper        0        0        0        0        0        0       28
## PIMN_2_inpaper        0        0        0        0        0        0        0
## PIMN_3_inpaper        0        0        0        0        0        0       10
## PIMN_4_inpaper        0        0        0        0        0        0        0
## PIMN_5_inpaper        0        0        0        0        0        0        0
## PIMN_6_inpaper        0        0        0        0        0        0        0
## PIMN_7_inpaper        0        0        0        0        0        0        0
## PIN_1_inpaper         0        0        0        0        0        0        0
## PIN_2_inpaper         0        0       78        0        0        0        0
## PIN_3_inpaper         0        0        1        0        0        0        0
## PSN_1_inpaper         0        0        0        0       51        0        0
## PSN_2_inpaper         0        0        0        0        5        0        0
## PSN_3_inpaper         0        0        0        0        0        0        0
## PSN_4_inpaper        75        0        0        0        0        1        0
## PSVN_1_inpaper        0        0        0       72        0        0        4
## PSVN_2_inpaper        0        0        0        0        0        0        0
## count                82       81       79       74       56       55       42
##                pheno_22 pheno_23 pheno_24 count
## PEMN_1_inpaper        4        0        0   132
## PEMN_2_inpaper       28        0        0   105
## PEMN_3_inpaper        0        0        0   243
## PEMN_4_inpaper        0        0        0   127
## PEMN_5_inpaper        0        0        0   119
## PIMN_1_inpaper        0        0        0   102
## PIMN_2_inpaper        0        0        0   177
## PIMN_3_inpaper        0        0        0    81
## PIMN_4_inpaper        0        0        0   275
## PIMN_5_inpaper        0        1        0   195
## PIMN_6_inpaper        0        6        0   267
## PIMN_7_inpaper        0       20        0    23
## PIN_1_inpaper         0        0        0   133
## PIN_2_inpaper         0        0        0   105
## PIN_3_inpaper         1        0        0    60
## PSN_1_inpaper         0        0        2    56
## PSN_2_inpaper         0        0       23    28
## PSN_3_inpaper         0        0        0    56
## PSN_4_inpaper         0        0        0   190
## PSVN_1_inpaper        0        0        0   102
## PSVN_2_inpaper        0        0        0    81
## count                33       27       25  2657
pheatmap::pheatmap(pheno_neur %>%  mutate_all(as.numeric), cluster_rows = F, cluster_cols = F, breaks = seq(0,300,3),
                   display_numbers =T, number_format = '%.0f', labels_row= rownames(pheno_neur))

check for each sub-clusters_inpaper

PEMNs_inpaper: totoally separated from other groups except to share pheno "1" (20, 10%) with PSN_4, but get mixed inside, _3 share a little "1" with _4/5, _2/4/5 share "12", so do pheno "16","20","6","7"

PIMNs_inpaper: similarly mixed inside, share pheno "0","10","2","23","3-5"

PINS_inpaper: mainly contain pheno "13" (_2/3 share),"14","17","8"

PSNs_inpaper: mainly contain pheno "11","15","19","24", _4 take some "1" from PEMN_3, _3 take some "14" and "8" from PIN_1

PSVNs_inpaper: mainly contain pheno "18" and "9"(_1 19, _2 80)

then we have

PEMNs_inlocal: pheno "7","12","22","1","16","20","6" as PEMN_1-7

PIMNs_inlocal: pheno "21","4","5","0","2","3","10","23" as PIMN_1-8

PINs_inlocal: pheno "8","14","17","13" as PIN_1-4

PSNs_inlocal: pheno "19","24","11","15" as PSN_1-4

PSVNs_inlocal: pheno "18","9" as PSVN_1-4

this might be laboursome to do it manually,
one way to do it automatically could be sorting the max count for each one to get a result like the heatmap below
(however still a cheap way to simulate a similar result but not for initial annotations)

pheatmap::pheatmap(pheno_neur[,c(paste0("pheno_",c(7,12,22,1,16,20,6,21,4,5,0,2,3,10,23,8,14,17,13,19,24,11,15,18,9)),"count")] %>% 
                     mutate_all(as.numeric), 
                   cluster_rows = F, cluster_cols = F, breaks = seq(0,300,3),
                   display_numbers =T, number_format = '%.0f', labels_row= rownames(pheno_neur))

then we have

subclust_neur <- data.frame(inlocal=paste0(c(paste0("PEMN_",1:7),paste0("PIMN_",1:8),paste0("PIN_",1:4),paste0("PSN_",1:4),paste0("PSVN_",1:2))),
                            raw=paste0("pheno_",c(7,12,22,1,16,20,6,21,4,5,0,2,3,10,23,8,14,17,13,19,24,11,15,18,9)))

pheatmap::pheatmap(pheno_neur[,c(subclust_neur$raw,"count")] %>% 
                     mutate_all(as.numeric), 
                   cluster_rows = F, cluster_cols = F, breaks = seq(0,300,3),
                   display_numbers =T, number_format = '%.0f', labels_row= rownames(pheno_neur),
                   labels_col = c(paste0(subclust_neur$inlocal,"_inlocal"),"count"),
                   gaps_col = c(7,15,19,23,25), gaps_row = c(5,12,15,19,21))

#write.table(pheno_neur,"pheno_neur.ss2.csv",col.names = TRUE,row.names = TRUE,sep = ",",quote = F)

label inlocal ss2_neur sub-clusters

ss2_neur.seur@meta.data$inlocal <- paste0("pheno_",ss2_neur.seur@meta.data$phenograph.k15)
for(i in 1:length(ss2_neur.seur@meta.data$inlocal)){
  ss2_neur.seur@meta.data$inlocal[i] <- subclust_neur$inlocal[subclust_neur$raw == ss2_neur.seur@meta.data$inlocal[i]]
}
DimPlot(ss2_neur.seur, group.by = 'inlocal', label = T) + labs(title = "ss2_neur inlocal")

for ss2_neur sub-clustering, comparing to inpaper annotations,
we get a generally separated result between groups, but broadly mixed within one group, and partly mixed across some groups

DimPlot(ss2_neur.seur, group.by = 'Annotation', label = T) + labs(title = "ss2_neur inpaper")

check

marker pairs for main division into two groups

FeaturePlot(ss2_neur.seur, features = c("Chat","Nos1","Gfra2","Gfra1","Casz1","Etv1"))

group by conditions

multiplot(
DimPlot(ss2_neur.seur, group.by = 'Age',order=(c("NA","11","12","52","104","105")), 
                   cols = rev(c("#312D8F","#312D8F","#7BAF42","#CB452B","#CB452B"))) + 
                labs(title = "Age"),
DimPlot(ss2_neur.seur, group.by = 'Segment',order=(c("NA","7AM","2PM","7PM"))) + labs(title = "Segment"),
DimPlot(ss2_neur.seur, group.by = 'Model') + labs(title = "Model"),
DimPlot(ss2_neur.seur, group.by = 'nGene', cols = material.heat(1909)) + labs(title = "Number of genes")  + 
             guides(colour = guide_coloursteps(show.limits = TRUE,label=FALSE)),
DimPlot(ss2_neur.seur, group.by = 'Sex') + labs(title = "Sex"),
DimPlot(ss2_neur.seur, group.by = 'Time',order=(c("NA","7AM","2PM","7PM"))) + labs(title = "Time"),
cols=3)

by batch(Mouse_ID)

DimPlot(ss2_neur.seur, group.by = 'Mouse_ID', order=paste0("M",as.character(rev(1:29)))) + labs(title = "Mouse") +
                  theme(legend.text=element_text(size=8))

(other analysis for ss2_neur sub-clusters all in another subpage)

ss2_glia

check results

PCs

ElbowPlot(ss2_glia.seur,ndims = 15) + labs(title = "ss2_glia PCs, take 1-9")

PC1&2

DimPlot(ss2_glia.seur, reduction = 'pca', dims = c(1,2)) + labs(title = "ss2_glia PC1+2 ")

DimPlot(ss2_glia.seur, reduction = 'pca', dims = c(1,2), group.by = 'Annotation') + labs(title = "ss2_glia PC1+2  with inpaper annotations")

tSNE

DimPlot(ss2_glia.seur, reduction = 'tsne') + labs(title = "ss2_glia tSNE")

DimPlot(ss2_glia.seur, reduction = 'tsne', group.by = 'Annotation',label = T ) + labs(title = "ss2_glia tSNE  with  inpaper annotations")

Phenograph clusters

DimPlot(ss2_glia.seur, group.by = 'phenograph.k500', label = T) + labs(title = "ss2_glia Phenograph output")

annotation

could see we get a same 3 clusters with a very big 'k' for kNN setting,
one downleft cluster with 99% matched but the other two with 1/5-1/4 mixed

pheno_glia <- ss2_glia.seur@meta.data[c('phenograph.k500','Annotation')]
pheno_glia <- group_by(pheno_glia, phenograph.k500)

pheno_glia <- summarise(pheno_glia,
                         count=n(),
                         Glia_1_inpaper = sum(Annotation == 'Glia_1'),
                         Glia_2_inpaper = sum(Annotation == 'Glia_2'),
                         Glia_3_inpaper = sum(Annotation == 'Glia_3'))

pheno_glia <- rbind(pheno_glia,c("count",colSums(pheno_glia[,2:ncol(pheno_glia)])))
pheno_glia <- data.frame(t(pheno_glia))[c(1,3:5,2),][,c(order(as.numeric(pheno_glia$phenograph.k500[1:3])),4)]
colnames(pheno_glia) <- c(paste0("pheno_",pheno_glia[1,1:ncol(pheno_glia)-1]),"count")
pheno_glia <- pheno_glia[2:nrow(pheno_glia),]
pheno_glia
##                pheno_0 pheno_1 pheno_2 count
## Glia_1_inpaper      22    1009      25  1056
## Glia_2_inpaper     821      54     171  1046
## Glia_3_inpaper     268      11     658   937
## count             1111    1074     854  3039

so, annotate with max count as ss2_neur

pheatmap::pheatmap(pheno_glia[,c(2,1,3,4)] %>%  mutate_all(as.numeric), cluster_rows = F, cluster_cols = F, breaks = seq(0,1200,12),
                   display_numbers =T, number_format = '%.0f', labels_row= rownames(pheno_glia))

less clusters so faster to get

pheatmap::pheatmap(pheno_glia[,c(2,1,3,4)] %>%  mutate_all(as.numeric), cluster_rows = F, cluster_cols = F, breaks = seq(0,1200,12),
                   display_numbers =T, number_format = '%.0f', labels_row= rownames(pheno_glia),
                   labels_col = c(paste0("Glia_",1:3,"_inlocal"),"count"))

label inlocal ss2_glia sub-clusters

ss2_glia.seur@meta.data$inlocal <- ss2_glia.seur@meta.data$phenograph.k500
ss2_glia.seur@meta.data$inlocal[ss2_glia.seur@meta.data$inlocal=="1"] <- "Glia_1"
ss2_glia.seur@meta.data$inlocal[ss2_glia.seur@meta.data$inlocal=="0"] <- "Glia_2"
ss2_glia.seur@meta.data$inlocal[ss2_glia.seur@meta.data$inlocal=="2"] <- "Glia_3"
DimPlot(ss2_glia.seur, group.by = 'inlocal', label = T) + labs(title = "ss2_glia inlocal")

for ss2_glia sub-clustering, comparing to inpaper annotations,
we get mostly matched Glia_1, but partly mixed Glia_2/3

DimPlot(ss2_glia.seur, group.by = 'Annotation', label = T) + labs(title = "ss2_glia inpaper")

check

marker genes

FeaturePlot(ss2_glia.seur, features = c("Gfra2","Slc18a2","Ntsr1"))

group by conditions

multiplot(
DimPlot(ss2_glia.seur, group.by = 'Age',order=(c("NA","11","12","52","104","105")), 
                   cols = rev(c("#312D8F","#312D8F","#7BAF42","#CB452B","#CB452B"))) + 
                labs(title = "Age"),
DimPlot(ss2_glia.seur, group.by = 'Segment',order=(c("NA","7AM","2PM","7PM"))) + labs(title = "Segment"),
DimPlot(ss2_glia.seur, group.by = 'Model') + labs(title = "Model"),
DimPlot(ss2_glia.seur, group.by = 'nGene', cols = material.heat(1909)) + labs(title = "Number of genes")  + 
             guides(colour = guide_coloursteps(show.limits = TRUE,label=FALSE)),
DimPlot(ss2_glia.seur, group.by = 'Sex') + labs(title = "Sex"),
DimPlot(ss2_glia.seur, group.by = 'Time',order=(c("NA","7AM","2PM","7PM"))) + labs(title = "Time"),
cols=3)

by batch(Mouse_ID)

DimPlot(ss2_glia.seur, group.by = 'Mouse_ID', order=paste0("M",as.character(rev(1:29)))) + labs(title = "Mouse") +
                  theme(legend.text=element_text(size=8))

(other analysis for ss2_glia sub-clusters all in another subpage)

#saveRDS(ss2_neur.seur,"ss2_neur.seur.rds")
#saveRDS(ss2_glia.seur,"ss2_glia.seur.rds")